Add missing cnt (popcount) instruction support.
sim/aarch64/simulator.c
1 /* simulator.c -- Interface for the AArch64 simulator.
2
3 Copyright (C) 2015-2017 Free Software Foundation, Inc.
4
5 Contributed by Red Hat.
6
7 This file is part of GDB.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <sys/types.h>
27 #include <math.h>
28 #include <time.h>
29 #include <limits.h>
30
31 #include "simulator.h"
32 #include "cpustate.h"
33 #include "memory.h"
34
35 #define NO_SP 0
36 #define SP_OK 1
37
38 #define TST(_flag) (aarch64_test_CPSR_bit (cpu, _flag))
39 #define IS_SET(_X) (TST (( _X )) ? 1 : 0)
40 #define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)
41
42 /* Space saver macro. */
43 #define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))
44
45 #define HALT_UNALLOC \
46 do \
47 { \
48 TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
49 TRACE_INSN (cpu, \
50 "Unallocated instruction detected at sim line %d," \
51 " exe addr %" PRIx64, \
52 __LINE__, aarch64_get_PC (cpu)); \
53 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
54 sim_stopped, SIM_SIGILL); \
55 } \
56 while (0)
57
58 #define HALT_NYI \
59 do \
60 { \
61 TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
62 TRACE_INSN (cpu, \
63 "Unimplemented instruction detected at sim line %d," \
64 " exe addr %" PRIx64, \
65 __LINE__, aarch64_get_PC (cpu)); \
66 if (! TRACE_ANY_P (cpu)) \
67 sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
68 aarch64_get_instr (cpu)); \
69 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
70 sim_stopped, SIM_SIGABRT); \
71 } \
72 while (0)
73
74 #define NYI_assert(HI, LO, EXPECTED) \
75 do \
76 { \
77 if (INSTR ((HI), (LO)) != (EXPECTED)) \
78 HALT_NYI; \
79 } \
80 while (0)
81
82 /* Helper functions used by expand_logical_immediate. */
83
84 /* For i = 1, ..., N: result<i-1> = 1; all other bits are zero. */
85 static inline uint64_t
86 ones (int N)
87 {
88 return (N == 64 ? (uint64_t) -1 : ((1ULL << N) - 1));
89 }
90
91 /* Return bit N of VAL as bit 0; all other result bits are zero. */
92 static inline uint64_t
93 pickbit (uint64_t val, int N)
94 {
95 return pickbits64 (val, N, N);
96 }
97
98 static uint64_t
99 expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
100 {
101 uint64_t mask;
102 uint64_t imm;
103 unsigned simd_size;
104
105 /* The immediate value consists of S+1 bits set to 1, left rotated
106 by simd_size - R (in other words, right rotated by R), then replicated. */
107 if (N != 0)
108 {
109 simd_size = 64;
110 mask = 0xffffffffffffffffull;
111 }
112 else
113 {
114 switch (S)
115 {
116 case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32; break;
117 case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
118 case 0x30 ... 0x37: /* 110xxx */ simd_size = 8; S &= 0x7; break;
119 case 0x38 ... 0x3b: /* 1110xx */ simd_size = 4; S &= 0x3; break;
120 case 0x3c ... 0x3d: /* 11110x */ simd_size = 2; S &= 0x1; break;
121 default: return 0;
122 }
123 mask = (1ull << simd_size) - 1;
124 /* Top bits are IGNORED. */
125 R &= simd_size - 1;
126 }
127
128 /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected. */
129 if (S == simd_size - 1)
130 return 0;
131
132 /* S+1 consecutive bits to 1. */
133 /* NOTE: S can't be 63 due to detection above. */
134 imm = (1ull << (S + 1)) - 1;
135
136 /* Rotate to the left by simd_size - R. */
137 if (R != 0)
138 imm = ((imm << (simd_size - R)) & mask) | (imm >> R);
139
140 /* Replicate the value according to SIMD size. */
141 switch (simd_size)
142 {
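/* N.B. each case deliberately falls through to the next, doubling the
   replicated pattern until it fills the 64-bit result. */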
143 case 2: imm = (imm << 2) | imm;
144 case 4: imm = (imm << 4) | imm;
145 case 8: imm = (imm << 8) | imm;
146 case 16: imm = (imm << 16) | imm;
147 case 32: imm = (imm << 32) | imm;
148 case 64: break;
149 default: return 0;
150 }
151
152 return imm;
153 }
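/* A hand-worked example of the expansion above (a sketch, values
   computed from the rules in the function):
     N=0, immr=0, imms=1 selects simd_size 32 with S=1, so the element
     is (1 << 2) - 1 = 0x3; no rotation; replication via the switch
     yields 0x0000000300000003.
     N=0, immr=1, imms=0x30 selects simd_size 8 with S=0, so the element
     0x1 is rotated right by 1 to give 0x80, then replicated to
     0x8080808080808080.  */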
154
155 /* Instr[22,10] encodes N, immr and imms. We want a lookup table
156 for each possible combination, i.e. 13 bits worth of entries. */
157 #define LI_TABLE_SIZE (1 << 13)
158 static uint64_t LITable[LI_TABLE_SIZE];
159
160 void
161 aarch64_init_LIT_table (void)
162 {
163 unsigned index;
164
165 for (index = 0; index < LI_TABLE_SIZE; index++)
166 {
167 uint32_t N = uimm (index, 12, 12);
168 uint32_t immr = uimm (index, 11, 6);
169 uint32_t imms = uimm (index, 5, 0);
170
171 LITable [index] = expand_logical_immediate (imms, immr, N);
172 }
173 }
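/* A later decode step can then expand a logical immediate in a single
   lookup (a sketch - the actual dispatch appears further down this
   file); a zero entry flags an invalid N:immr:imms encoding, since 0 is
   itself not a representable logical immediate:

     uint64_t imm = LITable [INSTR (22, 10)];
     if (imm == 0)
       HALT_UNALLOC;
*/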
174
175 static void
176 dexNotify (sim_cpu *cpu)
177 {
178 /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
179 2 ==> exit Java, 3 ==> start next bytecode. */
180 uint32_t type = INSTR (14, 0);
181
182 TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);
183
184 switch (type)
185 {
186 case 0:
187 /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
188 aarch64_get_reg_u64 (cpu, R22, 0)); */
189 break;
190 case 1:
191 /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
192 aarch64_get_reg_u64 (cpu, R22, 0)); */
193 break;
194 case 2:
195 /* aarch64_notifyMethodExit (); */
196 break;
197 case 3:
198 /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
199 aarch64_get_reg_u64 (cpu, R22, 0)); */
200 break;
201 }
202 }
203
204 /* Secondary decode within top level groups. */
205
206 static void
207 dexPseudo (sim_cpu *cpu)
208 {
209 /* assert instr[28,27] = 00
210
211 We provide 2 pseudo instructions:
212
213 HALT stops execution of the simulator causing an immediate
214 return to the x86 code which entered it.
215
216 CALLOUT initiates recursive entry into x86 code. A register
217 argument holds the address of the x86 routine. Immediate
218 values in the instruction identify the number of general
219 purpose and floating point register arguments to be passed
220 and the type of any value to be returned. */
221
222 uint32_t PSEUDO_HALT = 0xE0000000U;
223 uint32_t PSEUDO_CALLOUT = 0x00018000U;
224 uint32_t PSEUDO_CALLOUTR = 0x00018001U;
225 uint32_t PSEUDO_NOTIFY = 0x00014000U;
226 uint32_t dispatch;
227
228 if (aarch64_get_instr (cpu) == PSEUDO_HALT)
229 {
230 TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
231 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
232 sim_stopped, SIM_SIGTRAP);
233 }
234
235 dispatch = INSTR (31, 15);
236
237 /* We do not handle callouts at the moment. */
238 if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
239 {
240 TRACE_EVENTS (cpu, " Callout");
241 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
242 sim_stopped, SIM_SIGABRT);
243 }
244
245 else if (dispatch == PSEUDO_NOTIFY)
246 dexNotify (cpu);
247
248 else
249 HALT_UNALLOC;
250 }
251
252 /* Load-store single register (unscaled offset)
253 These instructions employ a base register plus an unscaled signed
254 9 bit offset.
255
256 N.B. the base register (source) can be Xn or SP. All other
257 registers may not be SP. */
258
259 /* 32 bit load 32 bit unscaled signed 9 bit. */
260 static void
261 ldur32 (sim_cpu *cpu, int32_t offset)
262 {
263 unsigned rn = INSTR (9, 5);
264 unsigned rt = INSTR (4, 0);
265
266 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
267 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
268 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
269 + offset));
270 }
271
272 /* 64 bit load 64 bit unscaled signed 9 bit. */
273 static void
274 ldur64 (sim_cpu *cpu, int32_t offset)
275 {
276 unsigned rn = INSTR (9, 5);
277 unsigned rt = INSTR (4, 0);
278
279 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
280 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
281 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
282 + offset));
283 }
284
285 /* 32 bit load zero-extended byte unscaled signed 9 bit. */
286 static void
287 ldurb32 (sim_cpu *cpu, int32_t offset)
288 {
289 unsigned rn = INSTR (9, 5);
290 unsigned rt = INSTR (4, 0);
291
292 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
293 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
294 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
295 + offset));
296 }
297
298 /* 32 bit load sign-extended byte unscaled signed 9 bit. */
299 static void
300 ldursb32 (sim_cpu *cpu, int32_t offset)
301 {
302 unsigned rn = INSTR (9, 5);
303 unsigned rt = INSTR (4, 0);
304
305 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
306 aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
307 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
308 + offset));
309 }
310
311 /* 64 bit load sign-extended byte unscaled signed 9 bit. */
312 static void
313 ldursb64 (sim_cpu *cpu, int32_t offset)
314 {
315 unsigned rn = INSTR (9, 5);
316 unsigned rt = INSTR (4, 0);
317
318 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
319 aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
320 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
321 + offset));
322 }
323
324 /* 32 bit load zero-extended short unscaled signed 9 bit */
325 static void
326 ldurh32 (sim_cpu *cpu, int32_t offset)
327 {
328 unsigned rn = INSTR (9, 5);
329 unsigned rd = INSTR (4, 0);
330
331 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
332 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
333 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
334 + offset));
335 }
336
337 /* 32 bit load sign-extended short unscaled signed 9 bit */
338 static void
339 ldursh32 (sim_cpu *cpu, int32_t offset)
340 {
341 unsigned rn = INSTR (9, 5);
342 unsigned rd = INSTR (4, 0);
343
344 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
345 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
346 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
347 + offset));
348 }
349
350 /* 64 bit load sign-extended short unscaled signed 9 bit */
351 static void
352 ldursh64 (sim_cpu *cpu, int32_t offset)
353 {
354 unsigned rn = INSTR (9, 5);
355 unsigned rt = INSTR (4, 0);
356
357 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
358 aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
359 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
360 + offset));
361 }
362
363 /* 64 bit load sign-extended word unscaled signed 9 bit */
364 static void
365 ldursw (sim_cpu *cpu, int32_t offset)
366 {
367 unsigned rn = INSTR (9, 5);
368 unsigned rd = INSTR (4, 0);
369
370 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
371 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s32
372 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
373 + offset));
374 }
375
376 /* N.B. with stores the value in source is written to the address
377 identified by source2 modified by offset. */
378
379 /* 32 bit store 32 bit unscaled signed 9 bit. */
380 static void
381 stur32 (sim_cpu *cpu, int32_t offset)
382 {
383 unsigned rn = INSTR (9, 5);
384 unsigned rd = INSTR (4, 0);
385
386 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
387 aarch64_set_mem_u32 (cpu,
388 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
389 aarch64_get_reg_u32 (cpu, rd, NO_SP));
390 }
391
392 /* 64 bit store 64 bit unscaled signed 9 bit */
393 static void
394 stur64 (sim_cpu *cpu, int32_t offset)
395 {
396 unsigned rn = INSTR (9, 5);
397 unsigned rd = INSTR (4, 0);
398
399 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
400 aarch64_set_mem_u64 (cpu,
401 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
402 aarch64_get_reg_u64 (cpu, rd, NO_SP));
403 }
404
405 /* 32 bit store byte unscaled signed 9 bit */
406 static void
407 sturb (sim_cpu *cpu, int32_t offset)
408 {
409 unsigned rn = INSTR (9, 5);
410 unsigned rd = INSTR (4, 0);
411
412 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
413 aarch64_set_mem_u8 (cpu,
414 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
415 aarch64_get_reg_u8 (cpu, rd, NO_SP));
416 }
417
418 /* 32 bit store short unscaled signed 9 bit */
419 static void
420 sturh (sim_cpu *cpu, int32_t offset)
421 {
422 unsigned rn = INSTR (9, 5);
423 unsigned rd = INSTR (4, 0);
424
425 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
426 aarch64_set_mem_u16 (cpu,
427 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
428 aarch64_get_reg_u16 (cpu, rd, NO_SP));
429 }
430
431 /* Load single register pc-relative label
432 Offset is a signed 19 bit immediate count in words
433 rt may not be SP. */
434
435 /* 32 bit pc-relative load */
436 static void
437 ldr32_pcrel (sim_cpu *cpu, int32_t offset)
438 {
439 unsigned rd = INSTR (4, 0);
440
441 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
442 aarch64_set_reg_u64 (cpu, rd, NO_SP,
443 aarch64_get_mem_u32
444 (cpu, aarch64_get_PC (cpu) + offset * 4));
445 }
446
447 /* 64 bit pc-relative load */
448 static void
449 ldr_pcrel (sim_cpu *cpu, int32_t offset)
450 {
451 unsigned rd = INSTR (4, 0);
452
453 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
454 aarch64_set_reg_u64 (cpu, rd, NO_SP,
455 aarch64_get_mem_u64
456 (cpu, aarch64_get_PC (cpu) + offset * 4));
457 }
458
459 /* sign extended 32 bit pc-relative load */
460 static void
461 ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
462 {
463 unsigned rd = INSTR (4, 0);
464
465 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
466 aarch64_set_reg_u64 (cpu, rd, NO_SP,
467 aarch64_get_mem_s32
468 (cpu, aarch64_get_PC (cpu) + offset * 4));
469 }
470
471 /* float pc-relative load */
472 static void
473 fldrs_pcrel (sim_cpu *cpu, int32_t offset)
474 {
475 unsigned int rd = INSTR (4, 0);
476
477 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
478 aarch64_set_vec_u32 (cpu, rd, 0,
479 aarch64_get_mem_u32
480 (cpu, aarch64_get_PC (cpu) + offset * 4));
481 }
482
483 /* double pc-relative load */
484 static void
485 fldrd_pcrel (sim_cpu *cpu, int32_t offset)
486 {
487 unsigned int st = INSTR (4, 0);
488
489 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
490 aarch64_set_vec_u64 (cpu, st, 0,
491 aarch64_get_mem_u64
492 (cpu, aarch64_get_PC (cpu) + offset * 4));
493 }
494
495 /* long double pc-relative load. */
496 static void
497 fldrq_pcrel (sim_cpu *cpu, int32_t offset)
498 {
499 unsigned int st = INSTR (4, 0);
500 uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
501 FRegister a;
502
503 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
504 aarch64_get_mem_long_double (cpu, addr, & a);
505 aarch64_set_FP_long_double (cpu, st, a);
506 }
507
508 /* This can be used to scale an offset by applying
509 the requisite shift. The second argument is either
510 16, 32, 64 or 128. */
511
512 #define SCALE(_offset, _elementSize) \
513 ((_offset) << ScaleShift ## _elementSize)
514
515 /* This can be used to optionally scale a register derived offset
516 by applying the requisite shift as indicated by the Scaling
517 argument. The second argument is either 16, 32, 64 or 128.
518 The third argument is either Scaled or Unscaled.
519 N.B. when _Scaling is Scaled the offset is shifted by the
520 element's scale shift; when it is Unscaled no shift is applied. */
521
522 #define OPT_SCALE(_offset, _elementType, _Scaling) \
523 ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
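/* For example, with the usual ScaleShift values (an assumption here -
   the ScaleShift* constants are defined elsewhere in the sim headers,
   e.g. 16 -> 1, 32 -> 2, 64 -> 3, 128 -> 4):
     SCALE (3, 32)               == 12
     OPT_SCALE (3, 32, Scaled)   == 12
     OPT_SCALE (3, 32, Unscaled) == 3  */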
524
525 /* This can be used to zero or sign extend a 32 bit register derived
526 value to a 64 bit value. The first argument must be the value as
527 a uint32_t and the second must be either UXTW or SXTW. The result
528 is returned as an int64_t. */
529
530 static inline int64_t
531 extend (uint32_t value, Extension extension)
532 {
533 union
534 {
535 uint32_t u;
536 int32_t n;
537 } x;
538
539 /* A branchless variant of this ought to be possible. */
540 if (extension == UXTW || extension == NoExtension)
541 return value;
542
543 x.u = value;
544 return x.n;
545 }
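/* For example (assuming two's complement int32_t):
     extend (0xfffffffe, UXTW) == 0x00000000fffffffe   (zero extend)
     extend (0xfffffffe, SXTW) == (int64_t) -2          (sign extend)  */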
546
547 /* Scalar Floating Point
548
549 FP load/store single register (4 addressing modes)
550
551 N.B. the base register (source) can be the stack pointer.
552 The secondary source register (source2) can only be an Xn register. */
553
554 /* Load 32 bit unscaled signed 9 bit with pre- or post-writeback. */
555 static void
556 fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
557 {
558 unsigned rn = INSTR (9, 5);
559 unsigned st = INSTR (4, 0);
560 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
561
562 if (wb != Post)
563 address += offset;
564
565 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
566 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
567 if (wb == Post)
568 address += offset;
569
570 if (wb != NoWriteBack)
571 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
572 }
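/* To illustrate the writeback handling above, take rn = 0x1000 and
   offset = 8 (a hand-worked sketch; the same pattern recurs throughout
   this file):
     pre-index (wb is neither Post nor NoWriteBack):
                                access 0x1008, rn becomes 0x1008
     post-index (wb == Post):   access 0x1000, rn becomes 0x1008
     no writeback:              access 0x1008, rn is unchanged  */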
573
574 /* Load 8 bit with unsigned 12 bit offset. */
575 static void
576 fldrb_abs (sim_cpu *cpu, uint32_t offset)
577 {
578 unsigned rd = INSTR (4, 0);
579 unsigned rn = INSTR (9, 5);
580 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
581
582 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
583 aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
584 }
585
586 /* Load 16 bit scaled unsigned 12 bit. */
587 static void
588 fldrh_abs (sim_cpu *cpu, uint32_t offset)
589 {
590 unsigned rd = INSTR (4, 0);
591 unsigned rn = INSTR (9, 5);
592 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);
593
594 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
595 aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
596 }
597
598 /* Load 32 bit scaled unsigned 12 bit. */
599 static void
600 fldrs_abs (sim_cpu *cpu, uint32_t offset)
601 {
602 unsigned rd = INSTR (4, 0);
603 unsigned rn = INSTR (9, 5);
604 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);
605
606 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
607 aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
608 }
609
610 /* Load 64 bit scaled unsigned 12 bit. */
611 static void
612 fldrd_abs (sim_cpu *cpu, uint32_t offset)
613 {
614 unsigned rd = INSTR (4, 0);
615 unsigned rn = INSTR (9, 5);
616 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);
617
618 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
619 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
620 }
621
622 /* Load 128 bit scaled unsigned 12 bit. */
623 static void
624 fldrq_abs (sim_cpu *cpu, uint32_t offset)
625 {
626 unsigned rd = INSTR (4, 0);
627 unsigned rn = INSTR (9, 5);
628 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
629
630 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
631 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
632 aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
633 }
634
635 /* Load 32 bit scaled or unscaled zero- or sign-extended
636 32-bit register offset. */
637 static void
638 fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
639 {
640 unsigned rm = INSTR (20, 16);
641 unsigned rn = INSTR (9, 5);
642 unsigned st = INSTR (4, 0);
643 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
644 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
645 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
646
647 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
648 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
649 (cpu, address + displacement));
650 }
651
652 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback. */
653 static void
654 fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
655 {
656 unsigned rn = INSTR (9, 5);
657 unsigned st = INSTR (4, 0);
658 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
659
660 if (wb != Post)
661 address += offset;
662
663 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
664 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
665
666 if (wb == Post)
667 address += offset;
668
669 if (wb != NoWriteBack)
670 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
671 }
672
673 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset. */
674 static void
675 fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
676 {
677 unsigned rm = INSTR (20, 16);
678 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
679 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
680
681 fldrd_wb (cpu, displacement, NoWriteBack);
682 }
683
684 /* Load 128 bit unscaled signed 9 bit with pre- or post-writeback. */
685 static void
686 fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
687 {
688 FRegister a;
689 unsigned rn = INSTR (9, 5);
690 unsigned st = INSTR (4, 0);
691 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
692
693 if (wb != Post)
694 address += offset;
695
696 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
697 aarch64_get_mem_long_double (cpu, address, & a);
698 aarch64_set_FP_long_double (cpu, st, a);
699
700 if (wb == Post)
701 address += offset;
702
703 if (wb != NoWriteBack)
704 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
705 }
706
707 /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset */
708 static void
709 fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
710 {
711 unsigned rm = INSTR (20, 16);
712 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
713 uint64_t displacement = OPT_SCALE (extended, 128, scaling);
714
715 fldrq_wb (cpu, displacement, NoWriteBack);
716 }
717
718 /* Memory Access
719
720 load-store single register
721 There are four addressing modes available here which all employ a
722 64 bit source (base) register.
723
724 N.B. the base register (source) can be the stack pointer.
725 The secondary source register (source2) can only be an Xn register.
726
727 Scaled, 12-bit, unsigned immediate offset, without pre- and
728 post-index options.
729 Unscaled, 9-bit, signed immediate offset with pre- or post-index
730 writeback.
731 Scaled or unscaled 64-bit register offset.
732 Scaled or unscaled 32-bit extended register offset.
733
734 All offsets are assumed to be raw from the decode, i.e. the
735 simulator is expected to adjust scaled offsets based on the
736 accessed data size. With register or extended register offset
737 versions the same applies, except that in the latter case the
738 operation may also require a sign extend.
739
740 A separate method is provided for each possible addressing mode. */
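/* The four modes map onto the handler-name suffixes used below:
     _abs       - scaled, 12-bit, unsigned immediate offset
     _wb        - unscaled, 9-bit, signed immediate with pre-/post-index
     _scale_ext - scaled or unscaled register offset; the extension
                  argument distinguishes 32-bit extended from plain
                  64-bit register offsets  */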
741
742 /* 32 bit load 32 bit scaled unsigned 12 bit */
743 static void
744 ldr32_abs (sim_cpu *cpu, uint32_t offset)
745 {
746 unsigned rn = INSTR (9, 5);
747 unsigned rt = INSTR (4, 0);
748
749 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
750 /* The target register may not be SP but the source may be. */
751 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
752 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
753 + SCALE (offset, 32)));
754 }
755
756 /* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback. */
757 static void
758 ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
759 {
760 unsigned rn = INSTR (9, 5);
761 unsigned rt = INSTR (4, 0);
762 uint64_t address;
763
764 if (rn == rt && wb != NoWriteBack)
765 HALT_UNALLOC;
766
767 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
768
769 if (wb != Post)
770 address += offset;
771
772 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
773 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
774
775 if (wb == Post)
776 address += offset;
777
778 if (wb != NoWriteBack)
779 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
780 }
781
782 /* 32 bit load 32 bit scaled or unscaled
783 zero- or sign-extended 32-bit register offset */
784 static void
785 ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
786 {
787 unsigned rm = INSTR (20, 16);
788 unsigned rn = INSTR (9, 5);
789 unsigned rt = INSTR (4, 0);
790 /* rn may reference SP, rm and rt must reference ZR */
791
792 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
793 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
794 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
795
796 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
797 aarch64_set_reg_u64 (cpu, rt, NO_SP,
798 aarch64_get_mem_u32 (cpu, address + displacement));
799 }
800
801 /* 64 bit load 64 bit scaled unsigned 12 bit */
802 static void
803 ldr_abs (sim_cpu *cpu, uint32_t offset)
804 {
805 unsigned rn = INSTR (9, 5);
806 unsigned rt = INSTR (4, 0);
807
808 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
809 /* The target register may not be SP but the source may be. */
810 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
811 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
812 + SCALE (offset, 64)));
813 }
814
815 /* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback. */
816 static void
817 ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
818 {
819 unsigned rn = INSTR (9, 5);
820 unsigned rt = INSTR (4, 0);
821 uint64_t address;
822
823 if (rn == rt && wb != NoWriteBack)
824 HALT_UNALLOC;
825
826 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
827
828 if (wb != Post)
829 address += offset;
830
831 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
832 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
833
834 if (wb == Post)
835 address += offset;
836
837 if (wb != NoWriteBack)
838 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
839 }
840
841 /* 64 bit load 64 bit scaled or unscaled zero-
842 or sign-extended 32-bit register offset. */
843 static void
844 ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
845 {
846 unsigned rm = INSTR (20, 16);
847 unsigned rn = INSTR (9, 5);
848 unsigned rt = INSTR (4, 0);
849 /* rn may reference SP, rm and rt must reference ZR */
850
851 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
852 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
853 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
854
855 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
856 aarch64_set_reg_u64 (cpu, rt, NO_SP,
857 aarch64_get_mem_u64 (cpu, address + displacement));
858 }
859
860 /* 32 bit load zero-extended byte scaled unsigned 12 bit. */
861 static void
862 ldrb32_abs (sim_cpu *cpu, uint32_t offset)
863 {
864 unsigned rn = INSTR (9, 5);
865 unsigned rt = INSTR (4, 0);
866
867 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
868 /* The target register may not be SP but the source may be.
869 There is no scaling required for a byte load. */
870 aarch64_set_reg_u64 (cpu, rt, NO_SP,
871 aarch64_get_mem_u8
872 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
873 }
874
875 /* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback. */
876 static void
877 ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
878 {
879 unsigned rn = INSTR (9, 5);
880 unsigned rt = INSTR (4, 0);
881 uint64_t address;
882
883 if (rn == rt && wb != NoWriteBack)
884 HALT_UNALLOC;
885
886 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
887
888 if (wb != Post)
889 address += offset;
890
891 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
892 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
893
894 if (wb == Post)
895 address += offset;
896
897 if (wb != NoWriteBack)
898 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
899 }
900
901 /* 32 bit load zero-extended byte scaled or unscaled zero-
902 or sign-extended 32-bit register offset. */
903 static void
904 ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
905 {
906 unsigned rm = INSTR (20, 16);
907 unsigned rn = INSTR (9, 5);
908 unsigned rt = INSTR (4, 0);
909 /* rn may reference SP, rm and rt must reference ZR */
910
911 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
912 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
913 extension);
914
915 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
916 /* There is no scaling required for a byte load. */
917 aarch64_set_reg_u64 (cpu, rt, NO_SP,
918 aarch64_get_mem_u8 (cpu, address + displacement));
919 }
920
921 /* 64 bit load sign-extended byte unscaled signed 9 bit
922 with pre- or post-writeback. */
923 static void
924 ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
925 {
926 unsigned rn = INSTR (9, 5);
927 unsigned rt = INSTR (4, 0);
928 uint64_t address;
929 int64_t val;
930
931 if (rn == rt && wb != NoWriteBack)
932 HALT_UNALLOC;
933
934 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
935
936 if (wb != Post)
937 address += offset;
938
939 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
940 val = aarch64_get_mem_s8 (cpu, address);
941 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
942
943 if (wb == Post)
944 address += offset;
945
946 if (wb != NoWriteBack)
947 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
948 }
949
950 /* 64 bit load sign-extended byte scaled unsigned 12 bit. */
951 static void
952 ldrsb_abs (sim_cpu *cpu, uint32_t offset)
953 {
954 ldrsb_wb (cpu, offset, NoWriteBack);
955 }
956
957 /* 64 bit load sign-extended byte scaled or unscaled zero-
958 or sign-extended 32-bit register offset. */
959 static void
960 ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
961 {
962 unsigned rm = INSTR (20, 16);
963 unsigned rn = INSTR (9, 5);
964 unsigned rt = INSTR (4, 0);
965 /* rn may reference SP, rm and rt must reference ZR */
966
967 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
968 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
969 extension);
970 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
971 /* There is no scaling required for a byte load. */
972 aarch64_set_reg_s64 (cpu, rt, NO_SP,
973 aarch64_get_mem_s8 (cpu, address + displacement));
974 }
975
976 /* 32 bit load zero-extended short scaled unsigned 12 bit. */
977 static void
978 ldrh32_abs (sim_cpu *cpu, uint32_t offset)
979 {
980 unsigned rn = INSTR (9, 5);
981 unsigned rt = INSTR (4, 0);
982 uint32_t val;
983
984 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
985 /* The target register may not be SP but the source may be. */
986 val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
987 + SCALE (offset, 16));
988 aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
989 }
990
991 /* 32 bit load zero-extended short unscaled signed 9 bit
992 with pre- or post-writeback. */
993 static void
994 ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
995 {
996 unsigned rn = INSTR (9, 5);
997 unsigned rt = INSTR (4, 0);
998 uint64_t address;
999
1000 if (rn == rt && wb != NoWriteBack)
1001 HALT_UNALLOC;
1002
1003 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1004
1005 if (wb != Post)
1006 address += offset;
1007
1008 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1009 aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1010
1011 if (wb == Post)
1012 address += offset;
1013
1014 if (wb != NoWriteBack)
1015 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1016 }
1017
1018 /* 32 bit load zero-extended short scaled or unscaled zero-
1019 or sign-extended 32-bit register offset. */
1020 static void
1021 ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1022 {
1023 unsigned rm = INSTR (20, 16);
1024 unsigned rn = INSTR (9, 5);
1025 unsigned rt = INSTR (4, 0);
1026 /* rn may reference SP, rm and rt must reference ZR */
1027
1028 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1029 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1030 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1031
1032 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1033 aarch64_set_reg_u32 (cpu, rt, NO_SP,
1034 aarch64_get_mem_u16 (cpu, address + displacement));
1035 }
1036
1037 /* 32 bit load sign-extended short scaled unsigned 12 bit. */
1038 static void
1039 ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
1040 {
1041 unsigned rn = INSTR (9, 5);
1042 unsigned rt = INSTR (4, 0);
1043 int32_t val;
1044
1045 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1046 /* The target register may not be SP but the source may be. */
1047 val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1048 + SCALE (offset, 16));
1049 aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
1050 }
1051
1052 /* 32 bit load sign-extended short unscaled signed 9 bit
1053 with pre- or post-writeback. */
1054 static void
1055 ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1056 {
1057 unsigned rn = INSTR (9, 5);
1058 unsigned rt = INSTR (4, 0);
1059 uint64_t address;
1060
1061 if (rn == rt && wb != NoWriteBack)
1062 HALT_UNALLOC;
1063
1064 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1065
1066 if (wb != Post)
1067 address += offset;
1068
1069 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1070 aarch64_set_reg_s32 (cpu, rt, NO_SP,
1071 (int32_t) aarch64_get_mem_s16 (cpu, address));
1072
1073 if (wb == Post)
1074 address += offset;
1075
1076 if (wb != NoWriteBack)
1077 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1078 }
1079
1080 /* 32 bit load sign-extended short scaled or unscaled zero-
1081 or sign-extended 32-bit register offset. */
1082 static void
1083 ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1084 {
1085 unsigned rm = INSTR (20, 16);
1086 unsigned rn = INSTR (9, 5);
1087 unsigned rt = INSTR (4, 0);
1088 /* rn may reference SP, rm and rt must reference ZR */
1089
1090 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1091 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1092 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1093
1094 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1095 aarch64_set_reg_s32 (cpu, rt, NO_SP,
1096 (int32_t) aarch64_get_mem_s16
1097 (cpu, address + displacement));
1098 }
1099
1100 /* 64 bit load sign-extended short scaled unsigned 12 bit. */
1101 static void
1102 ldrsh_abs (sim_cpu *cpu, uint32_t offset)
1103 {
1104 unsigned rn = INSTR (9, 5);
1105 unsigned rt = INSTR (4, 0);
1106 int64_t val;
1107
1108 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1109 /* The target register may not be SP but the source may be. */
1110 val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1111 + SCALE (offset, 16));
1112 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1113 }
1114
1115 /* 64 bit load sign-extended short unscaled signed 9 bit
1116 with pre- or post-writeback. */
1117 static void
1118 ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1119 {
1120 unsigned rn = INSTR (9, 5);
1121 unsigned rt = INSTR (4, 0);
1122 uint64_t address;
1123 int64_t val;
1124
1125 if (rn == rt && wb != NoWriteBack)
1126 HALT_UNALLOC;
1127
1128 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1129 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1130
1131 if (wb != Post)
1132 address += offset;
1133
1134 val = aarch64_get_mem_s16 (cpu, address);
1135 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1136
1137 if (wb == Post)
1138 address += offset;
1139
1140 if (wb != NoWriteBack)
1141 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1142 }
1143
1144 /* 64 bit load sign-extended short scaled or unscaled zero-
1145 or sign-extended 32-bit register offset. */
1146 static void
1147 ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1148 {
1149 unsigned rm = INSTR (20, 16);
1150 unsigned rn = INSTR (9, 5);
1151 unsigned rt = INSTR (4, 0);
1152
1153 /* rn may reference SP, rm and rt must reference ZR */
1154
1155 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1156 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1157 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1158 int64_t val;
1159
1160 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1161 val = aarch64_get_mem_s16 (cpu, address + displacement);
1162 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1163 }
1164
1165 /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit. */
1166 static void
1167 ldrsw_abs (sim_cpu *cpu, uint32_t offset)
1168 {
1169 unsigned rn = INSTR (9, 5);
1170 unsigned rt = INSTR (4, 0);
1171 int64_t val;
1172
1173 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1174 val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1175 + SCALE (offset, 32));
1176 /* The target register may not be SP but the source may be. */
1177 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1178 }
1179
1180 /* 64 bit load sign-extended 32 bit unscaled signed 9 bit
1181 with pre- or post-writeback. */
1182 static void
1183 ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1184 {
1185 unsigned rn = INSTR (9, 5);
1186 unsigned rt = INSTR (4, 0);
1187 uint64_t address;
1188
1189 if (rn == rt && wb != NoWriteBack)
1190 HALT_UNALLOC;
1191
1192 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1193
1194 if (wb != Post)
1195 address += offset;
1196
1197 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1198 aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));
1199
1200 if (wb == Post)
1201 address += offset;
1202
1203 if (wb != NoWriteBack)
1204 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1205 }
1206
1207 /* 64 bit load sign-extended 32 bit scaled or unscaled zero-
1208 or sign-extended 32-bit register offset. */
1209 static void
1210 ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1211 {
1212 unsigned rm = INSTR (20, 16);
1213 unsigned rn = INSTR (9, 5);
1214 unsigned rt = INSTR (4, 0);
1215 /* rn may reference SP, rm and rt must reference ZR */
1216
1217 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1218 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1219 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
1220
1221 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1222 aarch64_set_reg_s64 (cpu, rt, NO_SP,
1223 aarch64_get_mem_s32 (cpu, address + displacement));
1224 }
1225
1226 /* N.B. with stores the value in source is written to the
1227 address identified by source2 modified by source3/offset. */
1228
1229 /* 32 bit store scaled unsigned 12 bit. */
1230 static void
1231 str32_abs (sim_cpu *cpu, uint32_t offset)
1232 {
1233 unsigned rn = INSTR (9, 5);
1234 unsigned rt = INSTR (4, 0);
1235
1236 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1237 /* The target register may not be SP but the source may be. */
1238 aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
1239 + SCALE (offset, 32)),
1240 aarch64_get_reg_u32 (cpu, rt, NO_SP));
1241 }
1242
1243 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
1244 static void
1245 str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1246 {
1247 unsigned rn = INSTR (9, 5);
1248 unsigned rt = INSTR (4, 0);
1249 uint64_t address;
1250
1251 if (rn == rt && wb != NoWriteBack)
1252 HALT_UNALLOC;
1253
1254 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1255 if (wb != Post)
1256 address += offset;
1257
1258 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1259 aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));
1260
1261 if (wb == Post)
1262 address += offset;
1263
1264 if (wb != NoWriteBack)
1265 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1266 }
1267
1268 /* 32 bit store scaled or unscaled zero- or
1269 sign-extended 32-bit register offset. */
1270 static void
1271 str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1272 {
1273 unsigned rm = INSTR (20, 16);
1274 unsigned rn = INSTR (9, 5);
1275 unsigned rt = INSTR (4, 0);
1276
1277 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1278 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1279 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
1280
1281 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1282 aarch64_set_mem_u32 (cpu, address + displacement,
1283 aarch64_get_reg_u32 (cpu, rt, NO_SP));
1284 }
1285
1286 /* 64 bit store scaled unsigned 12 bit. */
1287 static void
1288 str_abs (sim_cpu *cpu, uint32_t offset)
1289 {
1290 unsigned rn = INSTR (9, 5);
1291 unsigned rt = INSTR (4, 0);
1292
1293 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1294 aarch64_set_mem_u64 (cpu,
1295 aarch64_get_reg_u64 (cpu, rn, SP_OK)
1296 + SCALE (offset, 64),
1297 aarch64_get_reg_u64 (cpu, rt, NO_SP));
1298 }
1299
1300 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
1301 static void
1302 str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1303 {
1304 unsigned rn = INSTR (9, 5);
1305 unsigned rt = INSTR (4, 0);
1306 uint64_t address;
1307
1308 if (rn == rt && wb != NoWriteBack)
1309 HALT_UNALLOC;
1310
1311 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1312
1313 if (wb != Post)
1314 address += offset;
1315
1316 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1317 aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));
1318
1319 if (wb == Post)
1320 address += offset;
1321
1322 if (wb != NoWriteBack)
1323 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1324 }
1325
1326 /* 64 bit store scaled or unscaled zero-
1327 or sign-extended 32-bit register offset. */
1328 static void
1329 str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1330 {
1331 unsigned rm = INSTR (20, 16);
1332 unsigned rn = INSTR (9, 5);
1333 unsigned rt = INSTR (4, 0);
1334 /* rn may reference SP, rm and rt must reference ZR */
1335
1336 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1337 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1338 extension);
1339 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1340
1341 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1342 aarch64_set_mem_u64 (cpu, address + displacement,
1343 aarch64_get_reg_u64 (cpu, rt, NO_SP));
1344 }
1345
1346 /* 32 bit store byte scaled unsigned 12 bit. */
1347 static void
1348 strb_abs (sim_cpu *cpu, uint32_t offset)
1349 {
1350 unsigned rn = INSTR (9, 5);
1351 unsigned rt = INSTR (4, 0);
1352
1353 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1354 /* The target register may not be SP but the source may be.
1355 There is no scaling required for a byte load. */
1356 aarch64_set_mem_u8 (cpu,
1357 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
1358 aarch64_get_reg_u8 (cpu, rt, NO_SP));
1359 }
1360
1361 /* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback. */
1362 static void
1363 strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1364 {
1365 unsigned rn = INSTR (9, 5);
1366 unsigned rt = INSTR (4, 0);
1367 uint64_t address;
1368
1369 if (rn == rt && wb != NoWriteBack)
1370 HALT_UNALLOC;
1371
1372 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1373
1374 if (wb != Post)
1375 address += offset;
1376
1377 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1378 aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));
1379
1380 if (wb == Post)
1381 address += offset;
1382
1383 if (wb != NoWriteBack)
1384 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1385 }
1386
1387 /* 32 bit store byte scaled or unscaled zero-
1388 or sign-extended 32-bit register offset. */
1389 static void
1390 strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1391 {
1392 unsigned rm = INSTR (20, 16);
1393 unsigned rn = INSTR (9, 5);
1394 unsigned rt = INSTR (4, 0);
1395 /* rn may reference SP, rm and rt must reference ZR */
1396
1397 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1398 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1399 extension);
1400
1401 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1402 /* There is no scaling required for a byte load. */
1403 aarch64_set_mem_u8 (cpu, address + displacement,
1404 aarch64_get_reg_u8 (cpu, rt, NO_SP));
1405 }
1406
1407 /* 32 bit store short scaled unsigned 12 bit. */
1408 static void
1409 strh_abs (sim_cpu *cpu, uint32_t offset)
1410 {
1411 unsigned rn = INSTR (9, 5);
1412 unsigned rt = INSTR (4, 0);
1413
1414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1415 /* The target register may not be SP but the source may be. */
1416 aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1417 + SCALE (offset, 16),
1418 aarch64_get_reg_u16 (cpu, rt, NO_SP));
1419 }
1420
1421 /* 32 bit store short unscaled signed 9 bit with pre- or post-writeback. */
1422 static void
1423 strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1424 {
1425 unsigned rn = INSTR (9, 5);
1426 unsigned rt = INSTR (4, 0);
1427 uint64_t address;
1428
1429 if (rn == rt && wb != NoWriteBack)
1430 HALT_UNALLOC;
1431
1432 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1433
1434 if (wb != Post)
1435 address += offset;
1436
1437 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1438 aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));
1439
1440 if (wb == Post)
1441 address += offset;
1442
1443 if (wb != NoWriteBack)
1444 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1445 }
1446
1447 /* 32 bit store short scaled or unscaled zero-
1448 or sign-extended 32-bit register offset. */
1449 static void
1450 strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1451 {
1452 unsigned rm = INSTR (20, 16);
1453 unsigned rn = INSTR (9, 5);
1454 unsigned rt = INSTR (4, 0);
1455 /* rn may reference SP, rm and rt must reference ZR */
1456
1457 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1458 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1459 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1460
1461 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1462 aarch64_set_mem_u16 (cpu, address + displacement,
1463 aarch64_get_reg_u16 (cpu, rt, NO_SP));
1464 }
1465
1466 /* Prefetch unsigned 12 bit. */
1467 static void
1468 prfm_abs (sim_cpu *cpu, uint32_t offset)
1469 {
1470 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1471 00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1472 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1473 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1474 10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1475 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1476 ow ==> UNALLOC
1477 PrfOp prfop = prfop (instr, 4, 0);
1478 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
1479 + SCALE (offset, 64). */
1480
1481 /* TODO : implement prefetch of address. */
1482 }
1483
1484 /* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset. */
1485 static void
1486 prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1487 {
1488 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1489 00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1490 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1491 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1492 10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1493 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1494 ow ==> UNALLOC
1495 rn may reference SP, rm may only reference ZR
1496 PrfOp prfop = prfop (instr, 4, 0);
1497 uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1498 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1499 extension);
1500 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1501 uint64_t address = base + displacement. */
1502
1503 /* TODO : implement prefetch of address */
1504 }
1505
1506 /* 64 bit pc-relative prefetch. */
1507 static void
1508 prfm_pcrel (sim_cpu *cpu, int32_t offset)
1509 {
1510 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1511 00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1512 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1513 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1514 10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1515 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1516 ow ==> UNALLOC
1517 PrfOp prfop = prfop (instr, 4, 0);
1518 uint64_t address = aarch64_get_PC (cpu) + offset. */
1519
1520 /* TODO : implement this */
1521 }
1522
1523 /* Load-store exclusive. */
1524
1525 static void
1526 ldxr (sim_cpu *cpu)
1527 {
1528 unsigned rn = INSTR (9, 5);
1529 unsigned rt = INSTR (4, 0);
1530 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1531 int size = INSTR (31, 30);
1532 /* int ordered = INSTR (15, 15); */
1533 /* int exclusive = ! INSTR (23, 23); */
1534
1535 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1536 switch (size)
1537 {
1538 case 0:
1539 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
1540 break;
1541 case 1:
1542 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1543 break;
1544 case 2:
1545 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
1546 break;
1547 case 3:
1548 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
1549 break;
1550 }
1551 }
1552
1553 static void
1554 stxr (sim_cpu *cpu)
1555 {
1556 unsigned rn = INSTR (9, 5);
1557 unsigned rt = INSTR (4, 0);
1558 unsigned rs = INSTR (20, 16);
1559 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1560 int size = INSTR (31, 30);
1561 uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);
1562
1563 switch (size)
1564 {
1565 case 0: aarch64_set_mem_u8 (cpu, address, data); break;
1566 case 1: aarch64_set_mem_u16 (cpu, address, data); break;
1567 case 2: aarch64_set_mem_u32 (cpu, address, data); break;
1568 case 3: aarch64_set_mem_u64 (cpu, address, data); break;
1569 }
1570
1571 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1572 aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Status 0 - the store always succeeds; the sim models a single processor. */
1573 }
1574
1575 static void
1576 dexLoadLiteral (sim_cpu *cpu)
1577 {
1578 /* instr[29,27] == 011
1579 instr[25,24] == 00
1580 instr[31,30:26] = opc: 000 ==> LDRW, 001 ==> FLDRS
1581 010 ==> LDRX, 011 ==> FLDRD
1582 100 ==> LDRSW, 101 ==> FLDRQ
1583 110 ==> PRFM, 111 ==> UNALLOC
1584 instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
1585 instr[23, 5] == simm19 */
1586
1587 /* unsigned rt = INSTR (4, 0); */
1588 uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
1589 int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);
1590
1591 switch (dispatch)
1592 {
1593 case 0: ldr32_pcrel (cpu, imm); break;
1594 case 1: fldrs_pcrel (cpu, imm); break;
1595 case 2: ldr_pcrel (cpu, imm); break;
1596 case 3: fldrd_pcrel (cpu, imm); break;
1597 case 4: ldrsw_pcrel (cpu, imm); break;
1598 case 5: fldrq_pcrel (cpu, imm); break;
1599 case 6: prfm_pcrel (cpu, imm); break;
1600 case 7:
1601 default:
1602 HALT_UNALLOC;
1603 }
1604 }
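/* A hand-worked example of the dispatch above (a sketch): the word
   0x58000020 encodes LDR X0, <label at PC+4> with opc = 01 and V = 0,
   so dispatch = (1 << 1) | 0 = 2 and ldr_pcrel is called with imm = 1,
   i.e. a 64-bit load from PC + 4 into X0.  */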
1605
1606 /* Immediate arithmetic
1607 The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
1608 value left shifted by 12 bits (done at decode).
1609
1610 N.B. the register args (dest, source) can normally be Xn or SP.
1611 The exception occurs for flag setting instructions which may
1612 only use Xn for the output (dest). */
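/* For example, "add w0, w1, #1, lsl #12" reaches add32 with
   aimm == 0x1000: the optional 12-bit left shift has already been
   applied by the decoder, as noted above.  */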
1613
1614 /* 32 bit add immediate. */
1615 static void
1616 add32 (sim_cpu *cpu, uint32_t aimm)
1617 {
1618 unsigned rn = INSTR (9, 5);
1619 unsigned rd = INSTR (4, 0);
1620
1621 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1622 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1623 aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
1624 }
1625
1626 /* 64 bit add immediate. */
1627 static void
1628 add64 (sim_cpu *cpu, uint32_t aimm)
1629 {
1630 unsigned rn = INSTR (9, 5);
1631 unsigned rd = INSTR (4, 0);
1632
1633 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1634 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1635 aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
1636 }
1637
1638 static void
1639 set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
1640 {
1641 int32_t result = (int32_t) ((uint32_t) value1 + (uint32_t) value2);
1642 int64_t sresult = (int64_t) value1 + (int64_t) value2;
1643 uint64_t uresult = (uint64_t)(uint32_t) value1
1644 + (uint64_t)(uint32_t) value2;
1645 uint32_t flags = 0;
1646
1647 if (result == 0)
1648 flags |= Z;
1649
1650 if (result & (1U << 31))
1651 flags |= N;
1652
1653 if (uresult != (uint32_t) result)
1654 flags |= C;
1655
1656 if (sresult != result)
1657 flags |= V;
1658
1659 aarch64_set_CPSR (cpu, flags);
1660 }
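/* Worked example: value1 = 0x7fffffff, value2 = 1 gives result
   0x80000000, setting N; sresult (+2^31) differs from the sign-extended
   result (-2^31), setting V; uresult equals the 32-bit result, so C
   stays clear. Flags = N|V, matching the architectural ADDS result.  */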
1661
1662 #define NEG(a) (((a) & signbit) == signbit)
1663 #define POS(a) (((a) & signbit) == 0)
1664
1665 static void
1666 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1667 {
1668 uint64_t result = value1 + value2;
1669 uint32_t flags = 0;
1670 uint64_t signbit = 1ULL << 63;
1671
1672 if (result == 0)
1673 flags |= Z;
1674
1675 if (NEG (result))
1676 flags |= N;
1677
1678 if ( (NEG (value1) && NEG (value2))
1679 || (NEG (value1) && POS (result))
1680 || (NEG (value2) && POS (result)))
1681 flags |= C;
1682
1683 if ( (NEG (value1) && NEG (value2) && POS (result))
1684 || (POS (value1) && POS (value2) && NEG (result)))
1685 flags |= V;
1686
1687 aarch64_set_CPSR (cpu, flags);
1688 }
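/* Worked example: 0x8000000000000000 + 0x8000000000000000 wraps to 0,
   setting Z; both operands are negative, so the carry out sets C; two
   negatives producing a non-negative result also sets V: flags = Z|C|V.  */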
1689
1690 static void
1691 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1692 {
1693 uint32_t result = value1 - value2;
1694 uint32_t flags = 0;
1695 uint32_t signbit = 1U << 31;
1696
1697 if (result == 0)
1698 flags |= Z;
1699
1700 if (NEG (result))
1701 flags |= N;
1702
1703 if ( (NEG (value1) && POS (value2))
1704 || (NEG (value1) && POS (result))
1705 || (POS (value2) && POS (result)))
1706 flags |= C;
1707
1708 if ( (NEG (value1) && POS (value2) && POS (result))
1709 || (POS (value1) && NEG (value2) && NEG (result)))
1710 flags |= V;
1711
1712 aarch64_set_CPSR (cpu, flags);
1713 }
1714
1715 static void
1716 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1717 {
1718 uint64_t result = value1 - value2;
1719 uint32_t flags = 0;
1720 uint64_t signbit = 1ULL << 63;
1721
1722 if (result == 0)
1723 flags |= Z;
1724
1725 if (NEG (result))
1726 flags |= N;
1727
1728 if ( (NEG (value1) && POS (value2))
1729 || (NEG (value1) && POS (result))
1730 || (POS (value2) && POS (result)))
1731 flags |= C;
1732
1733 if ( (NEG (value1) && POS (value2) && POS (result))
1734 || (POS (value1) && NEG (value2) && NEG (result)))
1735 flags |= V;
1736
1737 aarch64_set_CPSR (cpu, flags);
1738 }
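/* For subtraction C means "no borrow": e.g. 5 - 5 gives 0 with
   flags = Z|C (the POS (value2) && POS (result) term holds), while
   0 - 1 wraps to all ones with only N set, since the borrow leaves C
   clear.  */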
1739
1740 static void
1741 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1742 {
1743 uint32_t flags = 0;
1744
1745 if (result == 0)
1746 flags |= Z;
1747 else
1748 flags &= ~ Z;
1749
1750 if (result & (1U << 31))
1751 flags |= N;
1752 else
1753 flags &= ~ N;
1754
1755 aarch64_set_CPSR (cpu, flags);
1756 }
1757
1758 static void
1759 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1760 {
1761 uint32_t flags = 0;
1762
1763 if (result == 0)
1764 flags |= Z;
1765 else
1766 flags &= ~ Z;
1767
1768 if (result & (1ULL << 63))
1769 flags |= N;
1770 else
1771 flags &= ~ N;
1772
1773 aarch64_set_CPSR (cpu, flags);
1774 }
1775
1776 /* 32 bit add immediate set flags. */
1777 static void
1778 adds32 (sim_cpu *cpu, uint32_t aimm)
1779 {
1780 unsigned rn = INSTR (9, 5);
1781 unsigned rd = INSTR (4, 0);
1782 /* TODO : do we need to worry about signs here? */
1783 int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1784
1785 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1786 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1787 set_flags_for_add32 (cpu, value1, aimm);
1788 }
1789
1790 /* 64 bit add immediate set flags. */
1791 static void
1792 adds64 (sim_cpu *cpu, uint32_t aimm)
1793 {
1794 unsigned rn = INSTR (9, 5);
1795 unsigned rd = INSTR (4, 0);
1796 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1797 uint64_t value2 = aimm;
1798
1799 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1800 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1801 set_flags_for_add64 (cpu, value1, value2);
1802 }
1803
1804 /* 32 bit sub immediate. */
1805 static void
1806 sub32 (sim_cpu *cpu, uint32_t aimm)
1807 {
1808 unsigned rn = INSTR (9, 5);
1809 unsigned rd = INSTR (4, 0);
1810
1811 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1812 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1813 aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1814 }
1815
1816 /* 64 bit sub immediate. */
1817 static void
1818 sub64 (sim_cpu *cpu, uint32_t aimm)
1819 {
1820 unsigned rn = INSTR (9, 5);
1821 unsigned rd = INSTR (4, 0);
1822
1823 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1824 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1825 aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1826 }
1827
1828 /* 32 bit sub immediate set flags. */
1829 static void
1830 subs32 (sim_cpu *cpu, uint32_t aimm)
1831 {
1832 unsigned rn = INSTR (9, 5);
1833 unsigned rd = INSTR (4, 0);
1834 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
1835 uint32_t value2 = aimm;
1836
1837 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1838 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1839 set_flags_for_sub32 (cpu, value1, value2);
1840 }
1841
1842 /* 64 bit sub immediate set flags. */
1843 static void
1844 subs64 (sim_cpu *cpu, uint32_t aimm)
1845 {
1846 unsigned rn = INSTR (9, 5);
1847 unsigned rd = INSTR (4, 0);
1848 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1849 uint64_t value2 = aimm;
1850
1851 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1852 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1853 set_flags_for_sub64 (cpu, value1, value2);
1854 }
1855
1856 /* Data Processing Register. */
1857
1858 /* First two helpers to perform the shift operations. */
1859
1860 static inline uint32_t
1861 shifted32 (uint32_t value, Shift shift, uint32_t count)
1862 {
1863 switch (shift)
1864 {
1865 default:
1866 case LSL:
1867 return (value << count);
1868 case LSR:
1869 return (value >> count);
1870 case ASR:
1871 {
1872 int32_t svalue = value;
1873 return (svalue >> count);
1874 }
1875 case ROR:
1876 {
1877 uint32_t top = value >> count;
1878 uint32_t bottom = value << (32 - count);
1879 return (bottom | top);
1880 }
1881 }
1882 }
1883
1884 static inline uint64_t
1885 shifted64 (uint64_t value, Shift shift, uint32_t count)
1886 {
1887 switch (shift)
1888 {
1889 default:
1890 case LSL:
1891 return (value << count);
1892 case LSR:
1893 return (value >> count);
1894 case ASR:
1895 {
1896 int64_t svalue = value;
1897 return (svalue >> count);
1898 }
1899 case ROR:
1900 {
1901 uint64_t top = value >> count;
1902 uint64_t bottom = value << (64 - count);
1903 return (bottom | top);
1904 }
1905 }
1906 }
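
/* Worked example (illustrative): shifted32 (0x80000001, ROR, 1) yields
   top = 0x40000000 and bottom = 0x80000000, i.e. 0xC0000000.  N.B. a
   ROR count of zero would shift by the full register width, which is
   undefined behaviour in C; the decode paths are assumed never to pass
   a zero rotate here.  */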
1907
1908 /* Arithmetic shifted register.
1909 These allow an optional LSL, ASR or LSR to the second source
1910 register with a count up to the register bit count.
1911
1912 N.B register args may not be SP. */
1913
1914 /* 32 bit ADD shifted register. */
1915 static void
1916 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1917 {
1918 unsigned rm = INSTR (20, 16);
1919 unsigned rn = INSTR (9, 5);
1920 unsigned rd = INSTR (4, 0);
1921
1922 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1923 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1924 aarch64_get_reg_u32 (cpu, rn, NO_SP)
1925 + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1926 shift, count));
1927 }
1928
1929 /* 64 bit ADD shifted register. */
1930 static void
1931 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1932 {
1933 unsigned rm = INSTR (20, 16);
1934 unsigned rn = INSTR (9, 5);
1935 unsigned rd = INSTR (4, 0);
1936
1937 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1938 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1939 aarch64_get_reg_u64 (cpu, rn, NO_SP)
1940 + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1941 shift, count));
1942 }
1943
1944 /* 32 bit ADD shifted register setting flags. */
1945 static void
1946 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1947 {
1948 unsigned rm = INSTR (20, 16);
1949 unsigned rn = INSTR (9, 5);
1950 unsigned rd = INSTR (4, 0);
1951
1952 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1953 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1954 shift, count);
1955
1956 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1957 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1958 set_flags_for_add32 (cpu, value1, value2);
1959 }
1960
1961 /* 64 bit ADD shifted register setting flags. */
1962 static void
1963 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1964 {
1965 unsigned rm = INSTR (20, 16);
1966 unsigned rn = INSTR (9, 5);
1967 unsigned rd = INSTR (4, 0);
1968
1969 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1970 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1971 shift, count);
1972
1973 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1974 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1975 set_flags_for_add64 (cpu, value1, value2);
1976 }
1977
1978 /* 32 bit SUB shifted register. */
1979 static void
1980 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1981 {
1982 unsigned rm = INSTR (20, 16);
1983 unsigned rn = INSTR (9, 5);
1984 unsigned rd = INSTR (4, 0);
1985
1986 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1987 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1988 aarch64_get_reg_u32 (cpu, rn, NO_SP)
1989 - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1990 shift, count));
1991 }
1992
1993 /* 64 bit SUB shifted register. */
1994 static void
1995 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1996 {
1997 unsigned rm = INSTR (20, 16);
1998 unsigned rn = INSTR (9, 5);
1999 unsigned rd = INSTR (4, 0);
2000
2001 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2002 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2003 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2004 - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2005 shift, count));
2006 }
2007
2008 /* 32 bit SUB shifted register setting flags. */
2009 static void
2010 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2011 {
2012 unsigned rm = INSTR (20, 16);
2013 unsigned rn = INSTR (9, 5);
2014 unsigned rd = INSTR (4, 0);
2015
2016 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2017 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2018 shift, count);
2019
2020 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2021 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2022 set_flags_for_sub32 (cpu, value1, value2);
2023 }
2024
2025 /* 64 bit SUB shifted register setting flags. */
2026 static void
2027 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2028 {
2029 unsigned rm = INSTR (20, 16);
2030 unsigned rn = INSTR (9, 5);
2031 unsigned rd = INSTR (4, 0);
2032
2033 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2034 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2035 shift, count);
2036
2037 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2038 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2039 set_flags_for_sub64 (cpu, value1, value2);
2040 }
2041
2042 /* First a couple more helpers to fetch the
2043 relevant source register element either
2044 sign or zero extended as required by the
2045 extension value. */
2046
2047 static uint32_t
2048 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2049 {
2050 switch (extension)
2051 {
2052 case UXTB: return aarch64_get_reg_u8 (cpu, lo, NO_SP);
2053 case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2054 case UXTW: /* Fall through. */
2055 case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2056 case SXTB: return aarch64_get_reg_s8 (cpu, lo, NO_SP);
2057 case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2058 case SXTW: /* Fall through. */
2059 case SXTX: /* Fall through. */
2060 default: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2061 }
2062 }
2063
2064 static uint64_t
2065 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2066 {
2067 switch (extension)
2068 {
2069 case UXTB: return aarch64_get_reg_u8 (cpu, lo, NO_SP);
2070 case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2071 case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2072 case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2073 case SXTB: return aarch64_get_reg_s8 (cpu, lo, NO_SP);
2074 case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2075 case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2076 case SXTX:
2077 default: return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2078 }
2079 }
2080
2081 /* Arithmetic extending register
2082 These allow an optional sign extension of some portion of the
2083 second source register followed by an optional left shift of
2084 between 0 and 4 bits.
2085
2086 N.B output (dest) and first input arg (source) may normally be Xn
2087 or SP. However, for flag setting operations dest can only be
2088 Xn. Second input registers are always Xn. */
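
/* Worked example (illustrative): ADD X0, X1, W2, UXTB #3 extracts the
   low byte of W2, zero extends it and shifts it left by 3, so with
   W2 = 0x1FF the second operand becomes 0xFF << 3 = 0x7F8.  */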
2089
2090 /* 32 bit ADD extending register. */
2091 static void
2092 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2093 {
2094 unsigned rm = INSTR (20, 16);
2095 unsigned rn = INSTR (9, 5);
2096 unsigned rd = INSTR (4, 0);
2097
2098 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2099 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2100 aarch64_get_reg_u32 (cpu, rn, SP_OK)
2101 + (extreg32 (cpu, rm, extension) << shift));
2102 }
2103
2104 /* 64 bit ADD extending register.
2105 N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2106 static void
2107 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2108 {
2109 unsigned rm = INSTR (20, 16);
2110 unsigned rn = INSTR (9, 5);
2111 unsigned rd = INSTR (4, 0);
2112
2113 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2114 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2115 aarch64_get_reg_u64 (cpu, rn, SP_OK)
2116 + (extreg64 (cpu, rm, extension) << shift));
2117 }
2118
2119 /* 32 bit ADD extending register setting flags. */
2120 static void
2121 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2122 {
2123 unsigned rm = INSTR (20, 16);
2124 unsigned rn = INSTR (9, 5);
2125 unsigned rd = INSTR (4, 0);
2126
2127 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2128 uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2129
2130 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2131 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2132 set_flags_for_add32 (cpu, value1, value2);
2133 }
2134
2135 /* 64 bit ADD extending register setting flags */
2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2137 static void
2138 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2139 {
2140 unsigned rm = INSTR (20, 16);
2141 unsigned rn = INSTR (9, 5);
2142 unsigned rd = INSTR (4, 0);
2143
2144 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2145 uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2146
2147 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2148 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2149 set_flags_for_add64 (cpu, value1, value2);
2150 }
2151
2152 /* 32 bit SUB extending register. */
2153 static void
2154 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2155 {
2156 unsigned rm = INSTR (20, 16);
2157 unsigned rn = INSTR (9, 5);
2158 unsigned rd = INSTR (4, 0);
2159
2160 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2161 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2162 aarch64_get_reg_u32 (cpu, rn, SP_OK)
2163 - (extreg32 (cpu, rm, extension) << shift));
2164 }
2165
2166 /* 64 bit SUB extending register. */
2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2168 static void
2169 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2170 {
2171 unsigned rm = INSTR (20, 16);
2172 unsigned rn = INSTR (9, 5);
2173 unsigned rd = INSTR (4, 0);
2174
2175 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2176 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2177 aarch64_get_reg_u64 (cpu, rn, SP_OK)
2178 - (extreg64 (cpu, rm, extension) << shift));
2179 }
2180
2181 /* 32 bit SUB extending register setting flags. */
2182 static void
2183 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2184 {
2185 unsigned rm = INSTR (20, 16);
2186 unsigned rn = INSTR (9, 5);
2187 unsigned rd = INSTR (4, 0);
2188
2189 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2190 uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2191
2192 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2193 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2194 set_flags_for_sub32 (cpu, value1, value2);
2195 }
2196
2197 /* 64 bit SUB extending register setting flags */
2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2199 static void
2200 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2201 {
2202 unsigned rm = INSTR (20, 16);
2203 unsigned rn = INSTR (9, 5);
2204 unsigned rd = INSTR (4, 0);
2205
2206 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2207 uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2208
2209 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2210 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2211 set_flags_for_sub64 (cpu, value1, value2);
2212 }
2213
2214 static void
2215 dexAddSubtractImmediate (sim_cpu *cpu)
2216 {
2217 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2218 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2219 instr[29] = set : 0 ==> no flags, 1 ==> set flags
2220 instr[28,24] = 10001
2221 instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2222 instr[21,10] = uimm12
2223 instr[9,5] = Rn
2224 instr[4,0] = Rd */
2225
2226 /* N.B. the shift is applied at decode before calling the add/sub routine. */
2227 uint32_t shift = INSTR (23, 22);
2228 uint32_t imm = INSTR (21, 10);
2229 uint32_t dispatch = INSTR (31, 29);
2230
2231 NYI_assert (28, 24, 0x11);
2232
2233 if (shift > 1)
2234 HALT_UNALLOC;
2235
2236 if (shift)
2237 imm <<= 12;
2238
2239 switch (dispatch)
2240 {
2241 case 0: add32 (cpu, imm); break;
2242 case 1: adds32 (cpu, imm); break;
2243 case 2: sub32 (cpu, imm); break;
2244 case 3: subs32 (cpu, imm); break;
2245 case 4: add64 (cpu, imm); break;
2246 case 5: adds64 (cpu, imm); break;
2247 case 6: sub64 (cpu, imm); break;
2248 case 7: subs64 (cpu, imm); break;
2249 }
2250 }
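
/* Decode example (illustrative): SUBS W0, W1, #1, LSL #12 has
   size:op:set = 0b011 and shift = 01, so imm is widened to 0x1000 and
   dispatched to subs32 above.  */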
2251
2252 static void
2253 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2254 {
2255 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2256 instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2257 instr[28,24] = 01011
2258 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2259 instr[21] = 0
2260 instr[20,16] = Rm
2261 instr[15,10] = count : must be 0xxxxx for 32 bit
2262 instr[9,5] = Rn
2263 instr[4,0] = Rd */
2264
2265 uint32_t size = INSTR (31, 31);
2266 uint32_t count = INSTR (15, 10);
2267 Shift shiftType = INSTR (23, 22);
2268
2269 NYI_assert (28, 24, 0x0B);
2270 NYI_assert (21, 21, 0);
2271
2272 /* Shift encoded as ROR is unallocated. */
2273 if (shiftType == ROR)
2274 HALT_UNALLOC;
2275
2276 /* 32 bit operations must have count[5] = 0
2277 or else we have an UNALLOC. */
2278 if (size == 0 && uimm (count, 5, 5))
2279 HALT_UNALLOC;
2280
2281 /* Dispatch on size:op i.e instr [31,29]. */
2282 switch (INSTR (31, 29))
2283 {
2284 case 0: add32_shift (cpu, shiftType, count); break;
2285 case 1: adds32_shift (cpu, shiftType, count); break;
2286 case 2: sub32_shift (cpu, shiftType, count); break;
2287 case 3: subs32_shift (cpu, shiftType, count); break;
2288 case 4: add64_shift (cpu, shiftType, count); break;
2289 case 5: adds64_shift (cpu, shiftType, count); break;
2290 case 6: sub64_shift (cpu, shiftType, count); break;
2291 case 7: subs64_shift (cpu, shiftType, count); break;
2292 }
2293 }
2294
2295 static void
2296 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2297 {
2298 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2299 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2300 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2301 instr[28,24] = 01011
2302 instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2303 instr[21] = 1
2304 instr[20,16] = Rm
2305 instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2306 010 ==> LSL|UXTW, 011 ==> UXTX,
2307 100 ==> SXTB, 101 ==> SXTH,
2308 110 ==> SXTW, 111 ==> SXTX,
2309 instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2310 instr[9,5] = Rn
2311 instr[4,0] = Rd */
2312
2313 Extension extensionType = INSTR (15, 13);
2314 uint32_t shift = INSTR (12, 10);
2315
2316 NYI_assert (28, 24, 0x0B);
2317 NYI_assert (21, 21, 1);
2318
2319 /* Shift may not exceed 4. */
2320 if (shift > 4)
2321 HALT_UNALLOC;
2322
2323 /* Dispatch on size:op:set?. */
2324 switch (INSTR (31, 29))
2325 {
2326 case 0: add32_ext (cpu, extensionType, shift); break;
2327 case 1: adds32_ext (cpu, extensionType, shift); break;
2328 case 2: sub32_ext (cpu, extensionType, shift); break;
2329 case 3: subs32_ext (cpu, extensionType, shift); break;
2330 case 4: add64_ext (cpu, extensionType, shift); break;
2331 case 5: adds64_ext (cpu, extensionType, shift); break;
2332 case 6: sub64_ext (cpu, extensionType, shift); break;
2333 case 7: subs64_ext (cpu, extensionType, shift); break;
2334 }
2335 }
2336
2337 /* Conditional data processing
2338 Condition register is implicit 3rd source. */
2339
2340 /* 32 bit add with carry. */
2341 /* N.B register args may not be SP. */
2342
2343 static void
2344 adc32 (sim_cpu *cpu)
2345 {
2346 unsigned rm = INSTR (20, 16);
2347 unsigned rn = INSTR (9, 5);
2348 unsigned rd = INSTR (4, 0);
2349
2350 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2351 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2352 aarch64_get_reg_u32 (cpu, rn, NO_SP)
2353 + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2354 + IS_SET (C));
2355 }
2356
2357 /* 64 bit add with carry */
2358 static void
2359 adc64 (sim_cpu *cpu)
2360 {
2361 unsigned rm = INSTR (20, 16);
2362 unsigned rn = INSTR (9, 5);
2363 unsigned rd = INSTR (4, 0);
2364
2365 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2366 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2367 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2368 + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2369 + IS_SET (C));
2370 }
2371
2372 /* 32 bit add with carry setting flags. */
2373 static void
2374 adcs32 (sim_cpu *cpu)
2375 {
2376 unsigned rm = INSTR (20, 16);
2377 unsigned rn = INSTR (9, 5);
2378 unsigned rd = INSTR (4, 0);
2379
2380 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2381 uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2382 uint32_t carry = IS_SET (C);
2383
2384 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2385 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2386 set_flags_for_add32 (cpu, value1, value2 + carry);
2387 }
2388
2389 /* 64 bit add with carry setting flags. */
2390 static void
2391 adcs64 (sim_cpu *cpu)
2392 {
2393 unsigned rm = INSTR (20, 16);
2394 unsigned rn = INSTR (9, 5);
2395 unsigned rd = INSTR (4, 0);
2396
2397 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2398 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2399 uint64_t carry = IS_SET (C);
2400
2401 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2402 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2403 set_flags_for_add64 (cpu, value1, value2 + carry);
2404 }
2405
2406 /* 32 bit sub with carry. */
2407 static void
2408 sbc32 (sim_cpu *cpu)
2409 {
2410 unsigned rm = INSTR (20, 16);
2411 unsigned rn = INSTR (9, 5); /* ngc iff rn == 31. */
2412 unsigned rd = INSTR (4, 0);
2413
2414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2415 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2416 aarch64_get_reg_u32 (cpu, rn, NO_SP)
2417 - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2418 - 1 + IS_SET (C));
2419 }
2420
2421 /* 64 bit sub with carry */
2422 static void
2423 sbc64 (sim_cpu *cpu)
2424 {
2425 unsigned rm = INSTR (20, 16);
2426 unsigned rn = INSTR (9, 5);
2427 unsigned rd = INSTR (4, 0);
2428
2429 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2430 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2431 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2432 - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2433 - 1 + IS_SET (C));
2434 }
2435
2436 /* 32 bit sub with carry setting flags */
2437 static void
2438 sbcs32 (sim_cpu *cpu)
2439 {
2440 unsigned rm = INSTR (20, 16);
2441 unsigned rn = INSTR (9, 5);
2442 unsigned rd = INSTR (4, 0);
2443
2444 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2445 uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2446 uint32_t carry = IS_SET (C);
2447 uint32_t result = value1 - value2 - 1 + carry;
2448
2449 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2450 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2451 set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2452 }
2453
2454 /* 64 bit sub with carry setting flags */
2455 static void
2456 sbcs64 (sim_cpu *cpu)
2457 {
2458 unsigned rm = INSTR (20, 16);
2459 unsigned rn = INSTR (9, 5);
2460 unsigned rd = INSTR (4, 0);
2461
2462 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2463 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2464 uint64_t carry = IS_SET (C);
2465 uint64_t result = value1 - value2 - 1 + carry;
2466
2467 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2468 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2469 set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2470 }
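
/* Worked example (illustrative): SBCS X0, X1, X2 with X1 = 5, X2 = 3
   and C clear computes 5 - 3 - 1 = 1; the flags are derived from the
   effective subtrahend 3 + 1 - 0 = 4, so C ends up set (5 >= 4).  */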
2471
2472 static void
2473 dexAddSubtractWithCarry (sim_cpu *cpu)
2474 {
2475 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2476 instr[30] = op : 0 ==> ADC, 1 ==> SBC
2477 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2478 instr[28,21] = 1 1010 000
2479 instr[20,16] = Rm
2480 instr[15,10] = op2 : 00000 ==> ok, ow ==> UNALLOC
2481 instr[9,5] = Rn
2482 instr[4,0] = Rd */
2483
2484 uint32_t op2 = INSTR (15, 10);
2485
2486 NYI_assert (28, 21, 0xD0);
2487
2488 if (op2 != 0)
2489 HALT_UNALLOC;
2490
2491 /* Dispatch on size:op:set?. */
2492 switch (INSTR (31, 29))
2493 {
2494 case 0: adc32 (cpu); break;
2495 case 1: adcs32 (cpu); break;
2496 case 2: sbc32 (cpu); break;
2497 case 3: sbcs32 (cpu); break;
2498 case 4: adc64 (cpu); break;
2499 case 5: adcs64 (cpu); break;
2500 case 6: sbc64 (cpu); break;
2501 case 7: sbcs64 (cpu); break;
2502 }
2503 }
2504
2505 static uint32_t
2506 testConditionCode (sim_cpu *cpu, CondCode cc)
2507 {
2508 /* This should be reducible to branchless logic
2509 by some careful testing of bits in CC followed
2510 by the requisite masking and combining of bits
2511 from the flag register.
2512
2513 For now we do it with a switch. */
2514 int res;
2515
2516 switch (cc)
2517 {
2518 case EQ: res = IS_SET (Z); break;
2519 case NE: res = IS_CLEAR (Z); break;
2520 case CS: res = IS_SET (C); break;
2521 case CC: res = IS_CLEAR (C); break;
2522 case MI: res = IS_SET (N); break;
2523 case PL: res = IS_CLEAR (N); break;
2524 case VS: res = IS_SET (V); break;
2525 case VC: res = IS_CLEAR (V); break;
2526 case HI: res = IS_SET (C) && IS_CLEAR (Z); break;
2527 case LS: res = IS_CLEAR (C) || IS_SET (Z); break;
2528 case GE: res = IS_SET (N) == IS_SET (V); break;
2529 case LT: res = IS_SET (N) != IS_SET (V); break;
2530 case GT: res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V)); break;
2531 case LE: res = IS_SET (Z) || (IS_SET (N) != IS_SET (V)); break;
2532 case AL:
2533 case NV:
2534 default:
2535 res = 1;
2536 break;
2537 }
2538 return res;
2539 }
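
/* Example (illustrative): after CMP X0, X1 with equal operands, Z is
   set and N == V, so EQ, GE and LE all hold while NE, GT and LT fail.  */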
2540
2541 static void
2542 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn */
2543 {
2544 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2545 instr[30] = compare with positive (1) or negative value (0)
2546 instr[29,21] = 1 1101 0010
2547 instr[20,16] = Rm or const
2548 instr[15,12] = cond
2549 instr[11] = compare reg (0) or const (1)
2550 instr[10] = 0
2551 instr[9,5] = Rn
2552 instr[4] = 0
2553 instr[3,0] = value for CPSR bits if the comparison does not take place. */
2554 signed int negate;
2555 unsigned rm;
2556 unsigned rn;
2557
2558 NYI_assert (29, 21, 0x1d2);
2559 NYI_assert (10, 10, 0);
2560 NYI_assert (4, 4, 0);
2561
2562 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2563 if (! testConditionCode (cpu, INSTR (15, 12)))
2564 {
2565 aarch64_set_CPSR (cpu, INSTR (3, 0));
2566 return;
2567 }
2568
2569 negate = INSTR (30, 30) ? 1 : -1;
2570 rm = INSTR (20, 16);
2571 rn = INSTR ( 9, 5);
2572
2573 if (INSTR (31, 31))
2574 {
2575 if (INSTR (11, 11))
2576 set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2577 negate * (uint64_t) rm);
2578 else
2579 set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2580 negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2581 }
2582 else
2583 {
2584 if (INSTR (11, 11))
2585 set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2586 negate * rm);
2587 else
2588 set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2589 negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2590 }
2591 }
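
/* Example (illustrative): CCMP X1, #5, #0, EQ re-tests the Z flag; if
   EQ holds the flags are recomputed from X1 - 5 via the immediate path
   (instr[11] set), otherwise NZCV is loaded from the literal 0.  */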
2592
2593 static void
2594 do_vec_MOV_whole_vector (sim_cpu *cpu)
2595 {
2596 /* MOV Vd.T, Vs.T (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2597
2598 instr[31] = 0
2599 instr[30] = half(0)/full(1)
2600 instr[29,21] = 001110101
2601 instr[20,16] = Vs
2602 instr[15,10] = 000111
2603 instr[9,5] = Vs
2604 instr[4,0] = Vd */
2605
2606 unsigned vs = INSTR (9, 5);
2607 unsigned vd = INSTR (4, 0);
2608
2609 NYI_assert (29, 21, 0x075);
2610 NYI_assert (15, 10, 0x07);
2611
2612 if (INSTR (20, 16) != vs)
2613 HALT_NYI;
2614
2615 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2616 if (INSTR (30, 30))
2617 aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2618
2619 aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2620 }
2621
2622 static void
2623 do_vec_MOV_into_scalar (sim_cpu *cpu)
2624 {
2625 /* instr[31] = 0
2626 instr[30] = word(0)/long(1)
2627 instr[29,21] = 00 1110 000
2628 instr[20,18] = element size and index
2629 instr[17,10] = 00 0011 11
2630 instr[9,5] = V source
2631 instr[4,0] = R dest */
2632
2633 unsigned vs = INSTR (9, 5);
2634 unsigned rd = INSTR (4, 0);
2635
2636 NYI_assert (29, 21, 0x070);
2637 NYI_assert (17, 10, 0x0F);
2638
2639 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2640 switch (INSTR (20, 18))
2641 {
2642 case 0x2:
2643 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0));
2644 break;
2645
2646 case 0x6:
2647 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1));
2648 break;
2649
2650 case 0x1:
2651 case 0x3:
2652 case 0x5:
2653 case 0x7:
2654 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32
2655 (cpu, vs, INSTR (20, 19)));
2656 break;
2657
2658 default:
2659 HALT_NYI;
2660 }
2661 }
2662
2663 static void
2664 do_vec_INS (sim_cpu *cpu)
2665 {
2666 /* instr[31,21] = 01001110000
2667 instr[20,16] = element size and index
2668 instr[15,10] = 000111
2669 instr[9,5] = W source
2670 instr[4,0] = V dest */
2671
2672 int index;
2673 unsigned rs = INSTR (9, 5);
2674 unsigned vd = INSTR (4, 0);
2675
2676 NYI_assert (31, 21, 0x270);
2677 NYI_assert (15, 10, 0x07);
2678
2679 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2680 if (INSTR (16, 16))
2681 {
2682 index = INSTR (20, 17);
2683 aarch64_set_vec_u8 (cpu, vd, index,
2684 aarch64_get_reg_u8 (cpu, rs, NO_SP));
2685 }
2686 else if (INSTR (17, 17))
2687 {
2688 index = INSTR (20, 18);
2689 aarch64_set_vec_u16 (cpu, vd, index,
2690 aarch64_get_reg_u16 (cpu, rs, NO_SP));
2691 }
2692 else if (INSTR (18, 18))
2693 {
2694 index = INSTR (20, 19);
2695 aarch64_set_vec_u32 (cpu, vd, index,
2696 aarch64_get_reg_u32 (cpu, rs, NO_SP));
2697 }
2698 else if (INSTR (19, 19))
2699 {
2700 index = INSTR (20, 20);
2701 aarch64_set_vec_u64 (cpu, vd, index,
2702 aarch64_get_reg_u64 (cpu, rs, NO_SP));
2703 }
2704 else
2705 HALT_NYI;
2706 }
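
/* Encoding example (illustrative): INS Vd.S[2], Wn places its element
   size and index in instr[20,16] as 10100 -- bit 18 selects 32-bit
   elements and instr[20,19] = 0b10 gives index 2.  */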
2707
2708 static void
2709 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2710 {
2711 /* instr[31] = 0
2712 instr[30] = half(0)/full(1)
2713 instr[29,21] = 00 1110 000
2714 instr[20,16] = element size and index
2715 instr[15,10] = 0000 01
2716 instr[9,5] = V source
2717 instr[4,0] = V dest. */
2718
2719 unsigned full = INSTR (30, 30);
2720 unsigned vs = INSTR (9, 5);
2721 unsigned vd = INSTR (4, 0);
2722 int i, index;
2723
2724 NYI_assert (29, 21, 0x070);
2725 NYI_assert (15, 10, 0x01);
2726
2727 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2728 if (INSTR (16, 16))
2729 {
2730 index = INSTR (20, 17);
2731
2732 for (i = 0; i < (full ? 16 : 8); i++)
2733 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2734 }
2735 else if (INSTR (17, 17))
2736 {
2737 index = INSTR (20, 18);
2738
2739 for (i = 0; i < (full ? 8 : 4); i++)
2740 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2741 }
2742 else if (INSTR (18, 18))
2743 {
2744 index = INSTR (20, 19);
2745
2746 for (i = 0; i < (full ? 4 : 2); i++)
2747 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2748 }
2749 else
2750 {
2751 if (INSTR (19, 19) == 0)
2752 HALT_UNALLOC;
2753
2754 if (! full)
2755 HALT_UNALLOC;
2756
2757 index = INSTR (20, 20);
2758
2759 for (i = 0; i < 2; i++)
2760 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2761 }
2762 }
2763
2764 static void
2765 do_vec_TBL (sim_cpu *cpu)
2766 {
2767 /* instr[31] = 0
2768 instr[30] = half(0)/full(1)
2769 instr[29,21] = 00 1110 000
2770 instr[20,16] = Vm
2771 instr[15] = 0
2772 instr[14,13] = vec length
2773 instr[12,10] = 000
2774 instr[9,5] = V start
2775 instr[4,0] = V dest */
2776
2777 int full = INSTR (30, 30);
2778 int len = INSTR (14, 13) + 1;
2779 unsigned vm = INSTR (20, 16);
2780 unsigned vn = INSTR (9, 5);
2781 unsigned vd = INSTR (4, 0);
2782 unsigned i;
2783
2784 NYI_assert (29, 21, 0x070);
2785 NYI_assert (12, 10, 0);
2786
2787 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2788 for (i = 0; i < (full ? 16 : 8); i++)
2789 {
2790 unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2791 uint8_t val;
2792
2793 if (selector < 16)
2794 val = aarch64_get_vec_u8 (cpu, vn, selector);
2795 else if (selector < 32)
2796 val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2797 else if (selector < 48)
2798 val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2799 else if (selector < 64)
2800 val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2801 else
2802 val = 0;
2803
2804 aarch64_set_vec_u8 (cpu, vd, i, val);
2805 }
2806 }
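
/* Example (illustrative): with len = 2 a selector byte of 17 fetches
   element 1 of register vn + 1, while any selector >= 32 is out of
   range for a two register table and yields zero.  */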
2807
2808 static void
2809 do_vec_TRN (sim_cpu *cpu)
2810 {
2811 /* instr[31] = 0
2812 instr[30] = half(0)/full(1)
2813 instr[29,24] = 00 1110
2814 instr[23,22] = size
2815 instr[21] = 0
2816 instr[20,16] = Vm
2817 instr[15] = 0
2818 instr[14] = TRN1 (0) / TRN2 (1)
2819 instr[13,10] = 1010
2820 instr[9,5] = V source
2821 instr[4,0] = V dest. */
2822
2823 int full = INSTR (30, 30);
2824 int second = INSTR (14, 14);
2825 unsigned vm = INSTR (20, 16);
2826 unsigned vn = INSTR (9, 5);
2827 unsigned vd = INSTR (4, 0);
2828 unsigned i;
2829
2830 NYI_assert (29, 24, 0x0E);
2831 NYI_assert (13, 10, 0xA);
2832
2833 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2834 switch (INSTR (23, 22))
2835 {
2836 case 0:
2837 for (i = 0; i < (full ? 8 : 4); i++)
2838 {
2839 aarch64_set_vec_u8
2840 (cpu, vd, i * 2,
2841 aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
2842 aarch64_set_vec_u8
2843 (cpu, vd, i * 2 + 1,
2844 aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
2845 }
2846 break;
2847
2848 case 1:
2849 for (i = 0; i < (full ? 4 : 2); i++)
2850 {
2851 aarch64_set_vec_u16
2852 (cpu, vd, i * 2,
2853 aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
2854 aarch64_set_vec_u16
2855 (cpu, vd, i * 2 + 1,
2856 aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
2857 }
2858 break;
2859
2860 case 2:
2861 aarch64_set_vec_u32
2862 (cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
2863 aarch64_set_vec_u32
2864 (cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
2865 aarch64_set_vec_u32
2866 (cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
2867 aarch64_set_vec_u32
2868 (cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
2869 break;
2870
2871 case 3:
2872 if (! full)
2873 HALT_UNALLOC;
2874
2875 aarch64_set_vec_u64 (cpu, vd, 0,
2876 aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
2877 aarch64_set_vec_u64 (cpu, vd, 1,
2878 aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
2879 break;
2880 }
2881 }
2882
2883 static void
2884 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2885 {
2886 /* instr[31] = 0
2887 instr[30] = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2888 [must be 1 for 64-bit xfer]
2889 instr[29,20] = 00 1110 0000
2890 instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2891 0100=> 32-bits, 1000=> 64-bits
2892 instr[15,10] = 0000 11
2893 instr[9,5] = W source
2894 instr[4,0] = V dest. */
2895
2896 unsigned i;
2897 unsigned Vd = INSTR (4, 0);
2898 unsigned Rs = INSTR (9, 5);
2899 int both = INSTR (30, 30);
2900
2901 NYI_assert (29, 20, 0x0E0);
2902 NYI_assert (15, 10, 0x03);
2903
2904 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2905 switch (INSTR (19, 16))
2906 {
2907 case 1:
2908 for (i = 0; i < (both ? 16 : 8); i++)
2909 aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
2910 break;
2911
2912 case 2:
2913 for (i = 0; i < (both ? 8 : 4); i++)
2914 aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
2915 break;
2916
2917 case 4:
2918 for (i = 0; i < (both ? 4 : 2); i++)
2919 aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
2920 break;
2921
2922 case 8:
2923 if (!both)
2924 HALT_NYI;
2925 aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2926 aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2927 break;
2928
2929 default:
2930 HALT_NYI;
2931 }
2932 }
2933
2934 static void
2935 do_vec_UZP (sim_cpu *cpu)
2936 {
2937 /* instr[31] = 0
2938 instr[30] = half(0)/full(1)
2939 instr[29,24] = 00 1110
2940 instr[23,22] = size: byte(00), half(01), word (10), long (11)
2941 instr[21] = 0
2942 instr[20,16] = Vm
2943 instr[15] = 0
2944 instr[14] = lower (0) / upper (1)
2945 instr[13,10] = 0110
2946 instr[9,5] = Vn
2947 instr[4,0] = Vd. */
2948
2949 int full = INSTR (30, 30);
2950 int upper = INSTR (14, 14);
2951
2952 unsigned vm = INSTR (20, 16);
2953 unsigned vn = INSTR (9, 5);
2954 unsigned vd = INSTR (4, 0);
2955
2956 uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
2957 uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
2958 uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
2959 uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
2960
2961 uint64_t val1;
2962 uint64_t val2;
2963
2964 uint64_t input2 = full ? val_n2 : val_m1;
2965
2966 NYI_assert (29, 24, 0x0E);
2967 NYI_assert (21, 21, 0);
2968 NYI_assert (15, 15, 0);
2969 NYI_assert (13, 10, 6);
2970
2971 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2972 switch (INSTR (23, 22))
2973 {
2974 case 0:
2975 val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
2976 val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
2977 val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
2978 val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
2979
2980 val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
2981 val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
2982 val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
2983 val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
2984
2985 if (full)
2986 {
2987 val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
2988 val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
2989 val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
2990 val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
2991
2992 val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
2993 val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
2994 val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
2995 val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
2996 }
2997 break;
2998
2999 case 1:
3000 val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
3001 val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3002
3003 val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3004 val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3005
3006 if (full)
3007 {
3008 val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
3009 val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3010
3011 val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3012 val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3013 }
3014 break;
3015
3016 case 2:
3017 val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
3018 val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3019
3020 if (full)
3021 {
3022 val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
3023 val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3024 }
3025 break;
3026
3027 case 3:
3028 if (! full)
3029 HALT_UNALLOC;
3030
3031 val1 = upper ? val_n2 : val_n1;
3032 val2 = upper ? val_m2 : val_m1;
3033 break;
3034 }
3035
3036 aarch64_set_vec_u64 (cpu, vd, 0, val1);
3037 if (full)
3038 aarch64_set_vec_u64 (cpu, vd, 1, val2);
3039 }
3040
3041 static void
3042 do_vec_ZIP (sim_cpu *cpu)
3043 {
3044 /* instr[31] = 0
3045 instr[30] = half(0)/full(1)
3046 instr[29,24] = 00 1110
3047 instr[23,22] = size: byte(00), half(01), word (10), long (11)
3048 instr[21] = 0
3049 instr[20,16] = Vm
3050 instr[15] = 0
3051 instr[14] = lower (0) / upper (1)
3052 instr[13,10] = 1110
3053 instr[9,5] = Vn
3054 instr[4,0] = Vd. */
3055
3056 int full = INSTR (30, 30);
3057 int upper = INSTR (14, 14);
3058
3059 unsigned vm = INSTR (20, 16);
3060 unsigned vn = INSTR (9, 5);
3061 unsigned vd = INSTR (4, 0);
3062
3063 uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3064 uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3065 uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3066 uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3067
3068 uint64_t val1 = 0;
3069 uint64_t val2 = 0;
3070
3071 uint64_t input1 = upper ? (full ? val_n2 : (val_n1 >> 32)) : val_n1;
3072 uint64_t input2 = upper ? (full ? val_m2 : (val_m1 >> 32)) : val_m1;
3073
3074 NYI_assert (29, 24, 0x0E);
3075 NYI_assert (21, 21, 0);
3076 NYI_assert (15, 15, 0);
3077 NYI_assert (13, 10, 0xE);
3078
3079 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3080 switch (INSTR (23, 22))
3081 {
3082 case 0:
3083 val1 =
3084 ((input1 << 0) & (0xFF << 0))
3085 | ((input2 << 8) & (0xFF << 8))
3086 | ((input1 << 8) & (0xFF << 16))
3087 | ((input2 << 16) & (0xFFULL << 24))
3088 | ((input1 << 16) & (0xFFULL << 32))
3089 | ((input2 << 24) & (0xFFULL << 40))
3090 | ((input1 << 24) & (0xFFULL << 48))
3091 | ((input2 << 32) & (0xFFULL << 56));
3092
3093 val2 =
3094 ((input1 >> 32) & (0xFF << 0))
3095 | ((input2 >> 24) & (0xFF << 8))
3096 | ((input1 >> 24) & (0xFF << 16))
3097 | ((input2 >> 16) & (0xFFULL << 24))
3098 | ((input1 >> 16) & (0xFFULL << 32))
3099 | ((input2 >> 8) & (0xFFULL << 40))
3100 | ((input1 >> 8) & (0xFFULL << 48))
3101 | ((input2 >> 0) & (0xFFULL << 56));
3102 break;
3103
3104 case 1:
3105 val1 =
3106 ((input1 << 0) & (0xFFFF << 0))
3107 | ((input2 << 16) & (0xFFFFULL << 16))
3108 | ((input1 << 16) & (0xFFFFULL << 32))
3109 | ((input2 << 32) & (0xFFFFULL << 48));
3110
3111 val2 =
3112 ((input1 >> 32) & (0xFFFF << 0))
3113 | ((input2 >> 16) & (0xFFFFULL << 16))
3114 | ((input1 >> 16) & (0xFFFFULL << 32))
3115 | ((input2 >> 0) & (0xFFFFULL << 48));
3116 break;
3117
3118 case 2:
3119 val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3120 val2 = (input1 >> 32) | (input2 & 0xFFFFFFFF00000000ULL);
3121 break;
3122
3123 case 3:
3124 val1 = input1;
3125 val2 = input2;
3126 break;
3127 }
3128
3129 aarch64_set_vec_u64 (cpu, vd, 0, val1);
3130 if (full)
3131 aarch64_set_vec_u64 (cpu, vd, 1, val2);
3132 }
3133
3134 /* Floating point immediates are encoded in 8 bits.
3135 fpimm[7] = sign bit.
3136 fpimm[6:4] = signed exponent.
3137 fpimm[3:0] = fraction (assuming leading 1).
3138 i.e. F = s * 1.f * 2^(e - b). */
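
/* Worked examples (illustrative): encoding 0x70 (s = 0, e = 7, f = 0)
   expands to 1.0, and encoding 0x00 (s = 0, e = 0, f = 0) expands to
   2.0, matching the expansion loops below.  */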
3139
3140 static float
3141 fp_immediate_for_encoding_32 (uint32_t imm8)
3142 {
3143 float u;
3144 uint32_t s, e, f, i;
3145
3146 s = (imm8 >> 7) & 0x1;
3147 e = (imm8 >> 4) & 0x7;
3148 f = imm8 & 0xf;
3149
3150 /* The fp value is s * n/16 * 2^r where n is 16+f and r is the signed exponent. */
3151 u = (16.0 + f) / 16.0;
3152
3153 /* N.B. exponent is signed. */
3154 if (e < 4)
3155 {
3156 int epos = e;
3157
3158 for (i = 0; i <= epos; i++)
3159 u *= 2.0;
3160 }
3161 else
3162 {
3163 int eneg = 7 - e;
3164
3165 for (i = 0; i < eneg; i++)
3166 u /= 2.0;
3167 }
3168
3169 if (s)
3170 u = - u;
3171
3172 return u;
3173 }
3174
3175 static double
3176 fp_immediate_for_encoding_64 (uint32_t imm8)
3177 {
3178 double u;
3179 uint32_t s, e, f, i;
3180
3181 s = (imm8 >> 7) & 0x1;
3182 e = (imm8 >> 4) & 0x7;
3183 f = imm8 & 0xf;
3184
3185 /* The fp value is s * n/16 * 2^r where n is 16+f and r is the signed exponent. */
3186 u = (16.0 + f) / 16.0;
3187
3188 /* N.B. exponent is signed. */
3189 if (e < 4)
3190 {
3191 int epos = e;
3192
3193 for (i = 0; i <= epos; i++)
3194 u *= 2.0;
3195 }
3196 else
3197 {
3198 int eneg = 7 - e;
3199
3200 for (i = 0; i < eneg; i++)
3201 u /= 2.0;
3202 }
3203
3204 if (s)
3205 u = - u;
3206
3207 return u;
3208 }
3209
3210 static void
3211 do_vec_MOV_immediate (sim_cpu *cpu)
3212 {
3213 /* instr[31] = 0
3214 instr[30] = full/half selector
3215 instr[29,19] = 00111100000
3216 instr[18,16] = high 3 bits of uimm8
3217 instr[15,12] = size & shift:
3218 0000 => 32-bit
3219 0010 => 32-bit + LSL#8
3220 0100 => 32-bit + LSL#16
3221 0110 => 32-bit + LSL#24
3222 1010 => 16-bit + LSL#8
3223 1000 => 16-bit
3224 1101 => 32-bit + MSL#16
3225 1100 => 32-bit + MSL#8
3226 1110 => 8-bit
3227 1111 => double
3228 instr[11,10] = 01
3229 instr[9,5] = low 5-bits of uimm8
3230 instr[4,0] = Vd. */
3231
3232 int full = INSTR (30, 30);
3233 unsigned vd = INSTR (4, 0);
3234 unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3235 unsigned i;
3236
3237 NYI_assert (29, 19, 0x1E0);
3238 NYI_assert (11, 10, 1);
3239
3240 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3241 switch (INSTR (15, 12))
3242 {
3243 case 0x0: /* 32-bit, no shift. */
3244 case 0x2: /* 32-bit, shift by 8. */
3245 case 0x4: /* 32-bit, shift by 16. */
3246 case 0x6: /* 32-bit, shift by 24. */
3247 val <<= (8 * INSTR (14, 13));
3248 for (i = 0; i < (full ? 4 : 2); i++)
3249 aarch64_set_vec_u32 (cpu, vd, i, val);
3250 break;
3251
3252 case 0xa: /* 16-bit, shift by 8. */
3253 val <<= 8;
3254 /* Fall through. */
3255 case 0x8: /* 16-bit, no shift. */
3256 for (i = 0; i < (full ? 8 : 4); i++)
3257 aarch64_set_vec_u16 (cpu, vd, i, val);
3258 break;
3259
3260 case 0xd: /* 32-bit, mask shift by 16. */
3261 val <<= 8;
3262 val |= 0xFF;
3263 /* Fall through. */
3264 case 0xc: /* 32-bit, mask shift by 8. */
3265 val <<= 8;
3266 val |= 0xFF;
3267 for (i = 0; i < (full ? 4 : 2); i++)
3268 aarch64_set_vec_u32 (cpu, vd, i, val);
3269 break;
3270
3271 case 0xe: /* 8-bit, no shift. */
3272 for (i = 0; i < (full ? 16 : 8); i++)
3273 aarch64_set_vec_u8 (cpu, vd, i, val);
3274 break;
3275
3276 case 0xf: /* FMOV Vs.{2|4}S, #fpimm. */
3277 {
3278 float u = fp_immediate_for_encoding_32 (val);
3279 for (i = 0; i < (full ? 4 : 2); i++)
3280 aarch64_set_vec_float (cpu, vd, i, u);
3281 break;
3282 }
3283
3284 default:
3285 HALT_NYI;
3286 }
3287 }
3288
3289 static void
3290 do_vec_MVNI (sim_cpu *cpu)
3291 {
3292 /* instr[31] = 0
3293 instr[30] = full/half selector
3294 instr[29,19] = 10111100000
3295 instr[18,16] = high 3 bits of uimm8
3296 instr[15,12] = selector
3297 instr[11,10] = 01
3298 instr[9,5] = low 5-bits of uimm8
3299 instr[4,0] = Vd. */
3300
3301 int full = INSTR (30, 30);
3302 unsigned vd = INSTR (4, 0);
3303 unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3304 unsigned i;
3305
3306 NYI_assert (29, 19, 0x5E0);
3307 NYI_assert (11, 10, 1);
3308
3309 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3310 switch (INSTR (15, 12))
3311 {
3312 case 0x0: /* 32-bit, no shift. */
3313 case 0x2: /* 32-bit, shift by 8. */
3314 case 0x4: /* 32-bit, shift by 16. */
3315 case 0x6: /* 32-bit, shift by 24. */
3316 val <<= (8 * INSTR (14, 13));
3317 val = ~ val;
3318 for (i = 0; i < (full ? 4 : 2); i++)
3319 aarch64_set_vec_u32 (cpu, vd, i, val);
3320 return;
3321
3322 case 0xa: /* 16-bit, 8 bit shift. */
3323 val <<= 8;
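/* Fall through. */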
3324 case 0x8: /* 16-bit, no shift. */
3325 val = ~ val;
3326 for (i = 0; i < (full ? 8 : 4); i++)
3327 aarch64_set_vec_u16 (cpu, vd, i, val);
3328 return;
3329
3330 case 0xd: /* 32-bit, mask shift by 16. */
3331 val <<= 8;
3332 val |= 0xFF;
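/* Fall through. */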
3333 case 0xc: /* 32-bit, mask shift by 8. */
3334 val <<= 8;
3335 val |= 0xFF;
3336 val = ~ val;
3337 for (i = 0; i < (full ? 4 : 2); i++)
3338 aarch64_set_vec_u32 (cpu, vd, i, val);
3339 return;
3340
3341 case 0xE: /* MOVI Dn, #mask64 */
3342 {
3343 uint64_t mask = 0;
3344
3345 for (i = 0; i < 8; i++)
3346 if (val & (1 << i))
3347 mask |= (0xFFULL << (i * 8));
3348 aarch64_set_vec_u64 (cpu, vd, 0, mask);
3349 aarch64_set_vec_u64 (cpu, vd, 1, mask);
3350 return;
3351 }
3352
3353 case 0xf: /* FMOV Vd.2D, #fpimm. */
3354 {
3355 double u = fp_immediate_for_encoding_64 (val);
3356
3357 if (! full)
3358 HALT_UNALLOC;
3359
3360 aarch64_set_vec_double (cpu, vd, 0, u);
3361 aarch64_set_vec_double (cpu, vd, 1, u);
3362 return;
3363 }
3364
3365 default:
3366 HALT_NYI;
3367 }
3368 }
3369
3370 #define ABS(A) ((A) < 0 ? - (A) : (A))
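
/* N.B. ABS of the most negative representable value wraps to itself;
   that matches the AArch64 ABS result for that input (e.g. an 8-bit
   element 0x80 stays 0x80), so no special case is needed below.  */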
3371
3372 static void
3373 do_vec_ABS (sim_cpu *cpu)
3374 {
3375 /* instr[31] = 0
3376 instr[30] = half(0)/full(1)
3377 instr[29,24] = 00 1110
3378 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3379 instr[21,10] = 10 0000 1011 10
3380 instr[9,5] = Vn
3381 instr[4,0] = Vd. */
3382
3383 unsigned vn = INSTR (9, 5);
3384 unsigned vd = INSTR (4, 0);
3385 unsigned full = INSTR (30, 30);
3386 unsigned i;
3387
3388 NYI_assert (29, 24, 0x0E);
3389 NYI_assert (21, 10, 0x82E);
3390
3391 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3392 switch (INSTR (23, 22))
3393 {
3394 case 0:
3395 for (i = 0; i < (full ? 16 : 8); i++)
3396 aarch64_set_vec_s8 (cpu, vd, i,
3397 ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3398 break;
3399
3400 case 1:
3401 for (i = 0; i < (full ? 8 : 4); i++)
3402 aarch64_set_vec_s16 (cpu, vd, i,
3403 ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3404 break;
3405
3406 case 2:
3407 for (i = 0; i < (full ? 4 : 2); i++)
3408 aarch64_set_vec_s32 (cpu, vd, i,
3409 ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3410 break;
3411
3412 case 3:
3413 if (! full)
3414 HALT_NYI;
3415 for (i = 0; i < 2; i++)
3416 aarch64_set_vec_s64 (cpu, vd, i,
3417 ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3418 break;
3419 }
3420 }
3421
3422 static void
3423 do_vec_ADDV (sim_cpu *cpu)
3424 {
3425 /* instr[31] = 0
3426 instr[30] = full/half selector
3427 instr[29,24] = 00 1110
3428 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3429 instr[21,10] = 11 0001 1011 10
3430 instr[9,5] = Vm
3431 instr[4,0] = Rd. */
3432
3433 unsigned vm = INSTR (9, 5);
3434 unsigned rd = INSTR (4, 0);
3435 unsigned i;
3436 int full = INSTR (30, 30);
3437
3438 NYI_assert (29, 24, 0x0E);
3439 NYI_assert (21, 10, 0xC6E);
3440
3441 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3442 switch (INSTR (23, 22))
3443 {
3444 case 0:
3445 {
3446 uint8_t val = 0;
3447 for (i = 0; i < (full ? 16 : 8); i++)
3448 val += aarch64_get_vec_u8 (cpu, vm, i);
3449 aarch64_set_vec_u64 (cpu, rd, 0, val);
3450 return;
3451 }
3452
3453 case 1:
3454 {
3455 uint16_t val = 0;
3456 for (i = 0; i < (full ? 8 : 4); i++)
3457 val += aarch64_get_vec_u16 (cpu, vm, i);
3458 aarch64_set_vec_u64 (cpu, rd, 0, val);
3459 return;
3460 }
3461
3462 case 2:
3463 {
3464 uint32_t val = 0;
3465 if (! full)
3466 HALT_UNALLOC;
3467 for (i = 0; i < 4; i++)
3468 val += aarch64_get_vec_u32 (cpu, vm, i);
3469 aarch64_set_vec_u64 (cpu, rd, 0, val);
3470 return;
3471 }
3472
3473 case 3:
3474 HALT_UNALLOC;
3475 }
3476 }
3477
3478 static void
3479 do_vec_ins_2 (sim_cpu *cpu)
3480 {
3481 /* instr[31,21] = 01001110000
3482 instr[20,18] = size & element selector
3483 instr[17,14] = 0000
3484 instr[13] = direction: to vec(0), from vec (1)
3485 instr[12,10] = 111
3486 instr[9,5] = Vm
3487 instr[4,0] = Vd. */
3488
3489 unsigned elem;
3490 unsigned vm = INSTR (9, 5);
3491 unsigned vd = INSTR (4, 0);
3492
3493 NYI_assert (31, 21, 0x270);
3494 NYI_assert (17, 14, 0);
3495 NYI_assert (12, 10, 7);
3496
3497 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3498 if (INSTR (13, 13) == 1)
3499 {
3500 if (INSTR (18, 18) == 1)
3501 {
3502 /* 32-bit moves. */
3503 elem = INSTR (20, 19);
3504 aarch64_set_reg_u64 (cpu, vd, NO_SP,
3505 aarch64_get_vec_u32 (cpu, vm, elem));
3506 }
3507 else
3508 {
3509 /* 64-bit moves. */
3510 if (INSTR (19, 19) != 1)
3511 HALT_NYI;
3512
3513 elem = INSTR (20, 20);
3514 aarch64_set_reg_u64 (cpu, vd, NO_SP,
3515 aarch64_get_vec_u64 (cpu, vm, elem));
3516 }
3517 }
3518 else
3519 {
3520 if (INSTR (18, 18) == 1)
3521 {
3522 /* 32-bit moves. */
3523 elem = INSTR (20, 19);
3524 aarch64_set_vec_u32 (cpu, vd, elem,
3525 aarch64_get_reg_u32 (cpu, vm, NO_SP));
3526 }
3527 else
3528 {
3529 /* 64-bit moves. */
3530 if (INSTR (19, 19) != 1)
3531 HALT_NYI;
3532
3533 elem = INSTR (20, 20);
3534 aarch64_set_vec_u64 (cpu, vd, elem,
3535 aarch64_get_reg_u64 (cpu, vm, NO_SP));
3536 }
3537 }
3538 }
3539
3540 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE) \
3541 do \
3542 { \
3543 DST_TYPE a[N], b[N]; \
3544 \
3545 for (i = 0; i < (N); i++) \
3546 { \
3547 a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3548 b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3549 } \
3550 for (i = 0; i < (N); i++) \
3551 aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]); \
3552 } \
3553 while (0)
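
/* Usage sketch (illustrative): for UMULL2 Vd.8H, Vn.16B, Vm.16B the
   caller sets bias to 8 so the macro reads source elements 8..15,
   widening each u8 product into a u16 result element.  */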
3554
3555 static void
3556 do_vec_mull (sim_cpu *cpu)
3557 {
3558 /* instr[31] = 0
3559 instr[30] = lower(0)/upper(1) selector
3560 instr[29] = signed(0)/unsigned(1)
3561 instr[28,24] = 0 1110
3562 instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3563 instr[21] = 1
3564 instr[20,16] = Vm
3565 instr[15,10] = 11 0000
3566 instr[9,5] = Vn
3567 instr[4,0] = Vd. */
3568
3569 int unsign = INSTR (29, 29);
3570 int bias = INSTR (30, 30);
3571 unsigned vm = INSTR (20, 16);
3572 unsigned vn = INSTR ( 9, 5);
3573 unsigned vd = INSTR ( 4, 0);
3574 unsigned i;
3575
3576 NYI_assert (28, 24, 0x0E);
3577 NYI_assert (15, 10, 0x30);
3578
3579 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3580 /* NB: Read source values before writing results, in case
3581 the source and destination vectors are the same. */
3582 switch (INSTR (23, 22))
3583 {
3584 case 0:
3585 if (bias)
3586 bias = 8;
3587 if (unsign)
3588 DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3589 else
3590 DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3591 return;
3592
3593 case 1:
3594 if (bias)
3595 bias = 4;
3596 if (unsign)
3597 DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3598 else
3599 DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3600 return;
3601
3602 case 2:
3603 if (bias)
3604 bias = 2;
3605 if (unsign)
3606 DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3607 else
3608 DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3609 return;
3610
3611 case 3:
3612 HALT_NYI;
3613 }
3614 }
3615
3616 static void
3617 do_vec_fadd (sim_cpu *cpu)
3618 {
3619 /* instr[31] = 0
3620 instr[30] = half(0)/full(1)
3621 instr[29,24] = 001110
3622 instr[23] = FADD(0)/FSUB(1)
3623 instr[22] = float (0)/double(1)
3624 instr[21] = 1
3625 instr[20,16] = Vm
3626 instr[15,10] = 110101
3627 instr[9,5] = Vn
3628 instr[4,0] = Vd. */
3629
3630 unsigned vm = INSTR (20, 16);
3631 unsigned vn = INSTR (9, 5);
3632 unsigned vd = INSTR (4, 0);
3633 unsigned i;
3634 int full = INSTR (30, 30);
3635
3636 NYI_assert (29, 24, 0x0E);
3637 NYI_assert (21, 21, 1);
3638 NYI_assert (15, 10, 0x35);
3639
3640 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3641 if (INSTR (23, 23))
3642 {
3643 if (INSTR (22, 22))
3644 {
3645 if (! full)
3646 HALT_NYI;
3647
3648 for (i = 0; i < 2; i++)
3649 aarch64_set_vec_double (cpu, vd, i,
3650 aarch64_get_vec_double (cpu, vn, i)
3651 - aarch64_get_vec_double (cpu, vm, i));
3652 }
3653 else
3654 {
3655 for (i = 0; i < (full ? 4 : 2); i++)
3656 aarch64_set_vec_float (cpu, vd, i,
3657 aarch64_get_vec_float (cpu, vn, i)
3658 - aarch64_get_vec_float (cpu, vm, i));
3659 }
3660 }
3661 else
3662 {
3663 if (INSTR (22, 22))
3664 {
3665 if (! full)
3666 HALT_NYI;
3667
3668 for (i = 0; i < 2; i++)
3669 aarch64_set_vec_double (cpu, vd, i,
3670 aarch64_get_vec_double (cpu, vm, i)
3671 + aarch64_get_vec_double (cpu, vn, i));
3672 }
3673 else
3674 {
3675 for (i = 0; i < (full ? 4 : 2); i++)
3676 aarch64_set_vec_float (cpu, vd, i,
3677 aarch64_get_vec_float (cpu, vm, i)
3678 + aarch64_get_vec_float (cpu, vn, i));
3679 }
3680 }
3681 }
3682
3683 static void
3684 do_vec_add (sim_cpu *cpu)
3685 {
3686 /* instr[31] = 0
3687 instr[30] = full/half selector
3688 instr[29,24] = 001110
3689 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3690 instr[21] = 1
3691 instr[20,16] = Vm
3692 instr[15,10] = 100001
3693 instr[9,5] = Vn
3694 instr[4,0] = Vd. */
3695
3696 unsigned vm = INSTR (20, 16);
3697 unsigned vn = INSTR (9, 5);
3698 unsigned vd = INSTR (4, 0);
3699 unsigned i;
3700 int full = INSTR (30, 30);
3701
3702 NYI_assert (29, 24, 0x0E);
3703 NYI_assert (21, 21, 1);
3704 NYI_assert (15, 10, 0x21);
3705
3706 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3707 switch (INSTR (23, 22))
3708 {
3709 case 0:
3710 for (i = 0; i < (full ? 16 : 8); i++)
3711 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3712 + aarch64_get_vec_u8 (cpu, vm, i));
3713 return;
3714
3715 case 1:
3716 for (i = 0; i < (full ? 8 : 4); i++)
3717 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3718 + aarch64_get_vec_u16 (cpu, vm, i));
3719 return;
3720
3721 case 2:
3722 for (i = 0; i < (full ? 4 : 2); i++)
3723 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3724 + aarch64_get_vec_u32 (cpu, vm, i));
3725 return;
3726
3727 case 3:
3728 if (! full)
3729 HALT_UNALLOC;
3730 aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3731 + aarch64_get_vec_u64 (cpu, vm, 0));
3732 aarch64_set_vec_u64 (cpu, vd, 1,
3733 aarch64_get_vec_u64 (cpu, vn, 1)
3734 + aarch64_get_vec_u64 (cpu, vm, 1));
3735 return;
3736 }
3737 }
3738
3739 static void
3740 do_vec_mul (sim_cpu *cpu)
3741 {
3742 /* instr[31] = 0
3743 instr[30] = full/half selector
3744 instr[29,24] = 00 1110
3745 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3746 instr[21] = 1
3747 instr[20,16] = Vm
3748 instr[15,10] = 10 0111
3749 instr[9,5] = Vn
3750 instr[4,0] = Vd. */
3751
3752 unsigned vm = INSTR (20, 16);
3753 unsigned vn = INSTR (9, 5);
3754 unsigned vd = INSTR (4, 0);
3755 unsigned i;
3756 int full = INSTR (30, 30);
3757 int bias = 0;
3758
3759 NYI_assert (29, 24, 0x0E);
3760 NYI_assert (21, 21, 1);
3761 NYI_assert (15, 10, 0x27);
3762
3763 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3764 switch (INSTR (23, 22))
3765 {
3766 case 0:
3767 DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3768 return;
3769
3770 case 1:
3771 DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3772 return;
3773
3774 case 2:
3775 DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3776 return;
3777
3778 case 3:
3779 HALT_UNALLOC;
3780 }
3781 }
3782
3783 static void
3784 do_vec_MLA (sim_cpu *cpu)
3785 {
3786 /* instr[31] = 0
3787 instr[30] = full/half selector
3788 instr[29,24] = 00 1110
3789 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3790 instr[21] = 1
3791 instr[20,16] = Vm
3792 instr[15,10] = 1001 01
3793 instr[9,5] = Vn
3794 instr[4,0] = Vd. */
3795
3796 unsigned vm = INSTR (20, 16);
3797 unsigned vn = INSTR (9, 5);
3798 unsigned vd = INSTR (4, 0);
3799 unsigned i;
3800 int full = INSTR (30, 30);
3801
3802 NYI_assert (29, 24, 0x0E);
3803 NYI_assert (21, 21, 1);
3804 NYI_assert (15, 10, 0x25);
3805
3806 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3807 switch (INSTR (23, 22))
3808 {
3809 case 0:
3810 for (i = 0; i < (full ? 16 : 8); i++)
3811 aarch64_set_vec_u8 (cpu, vd, i,
3812 aarch64_get_vec_u8 (cpu, vd, i)
3813 + (aarch64_get_vec_u8 (cpu, vn, i)
3814 * aarch64_get_vec_u8 (cpu, vm, i)));
3815 return;
3816
3817 case 1:
3818 for (i = 0; i < (full ? 8 : 4); i++)
3819 aarch64_set_vec_u16 (cpu, vd, i,
3820 aarch64_get_vec_u16 (cpu, vd, i)
3821 + (aarch64_get_vec_u16 (cpu, vn, i)
3822 * aarch64_get_vec_u16 (cpu, vm, i)));
3823 return;
3824
3825 case 2:
3826 for (i = 0; i < (full ? 4 : 2); i++)
3827 aarch64_set_vec_u32 (cpu, vd, i,
3828 aarch64_get_vec_u32 (cpu, vd, i)
3829 + (aarch64_get_vec_u32 (cpu, vn, i)
3830 * aarch64_get_vec_u32 (cpu, vm, i)));
3831 return;
3832
3833 default:
3834 HALT_UNALLOC;
3835 }
3836 }
3837
3838 static float
3839 fmaxnm (float a, float b)
3840 {
3841 if (! isnan (a))
3842 {
3843 if (! isnan (b))
3844 return a > b ? a : b;
3845 return a;
3846 }
3847 else if (! isnan (b))
3848 return b;
3849 return a;
3850 }
3851
3852 static float
3853 fminnm (float a, float b)
3854 {
3855 if (! isnan (a))
3856 {
3857 if (! isnan (b))
3858 return a < b ? a : b;
3859 return a;
3860 }
3861 else if (! isnan (b))
3862 return b;
3863 return a;
3864 }
3865
3866 static double
3867 dmaxnm (double a, double b)
3868 {
3869 if (! isnan (a))
3870 {
3871 if (! isnan (b))
3872 return a > b ? a : b;
3873 return a;
3874 }
3875 else if (! isnan (b))
3876 return b;
3877 return a;
3878 }
3879
3880 static double
3881 dminnm (double a, double b)
3882 {
3883 if (! isnan (a))
3884 {
3885 if (! isnan (b))
3886 return a < b ? a : b;
3887 return a;
3888 }
3889 else if (! isnan (b))
3890 return b;
3891 return a;
3892 }
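
/* Examples (illustrative): fmaxnm (2.0f, NAN) and fmaxnm (NAN, 2.0f)
   both return 2.0f, since a NaN operand is ignored when the other
   operand is a number; only fmaxnm (NAN, NAN) returns a NaN.  */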
3893
3894 static void
3895 do_vec_FminmaxNMP (sim_cpu *cpu)
3896 {
3897 /* instr [31] = 0
3898 instr [30] = half (0)/full (1)
3899 instr [29,24] = 10 1110
3900 instr [23] = max(0)/min(1)
3901 instr [22] = float (0)/double (1)
3902 instr [21] = 1
3903 instr [20,16] = Vm
3904 instr [15,10] = 1100 01
3905 instr [9,5] = Vn
3906 instr [4,0] = Vd. */
3907
3908 unsigned vm = INSTR (20, 16);
3909 unsigned vn = INSTR (9, 5);
3910 unsigned vd = INSTR (4, 0);
3911 int full = INSTR (30, 30);
3912
3913 NYI_assert (29, 24, 0x2E);
3914 NYI_assert (21, 21, 1);
3915 NYI_assert (15, 10, 0x31);
3916
3917 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3918 if (INSTR (22, 22))
3919 {
3920 double (* fn)(double, double) = INSTR (23, 23)
3921 ? dminnm : dmaxnm;
3922
3923 if (! full)
3924 HALT_NYI;
3925 aarch64_set_vec_double (cpu, vd, 0,
3926 fn (aarch64_get_vec_double (cpu, vn, 0),
3927 aarch64_get_vec_double (cpu, vn, 1)));
3928 aarch64_set_vec_double (cpu, vd, 1,
3929 fn (aarch64_get_vec_double (cpu, vm, 0),
3930 aarch64_get_vec_double (cpu, vm, 1)));
3931 }
3932 else
3933 {
3934 float (* fn)(float, float) = INSTR (23, 23)
3935 ? fminnm : fmaxnm;
3936
3937 aarch64_set_vec_float (cpu, vd, 0,
3938 fn (aarch64_get_vec_float (cpu, vn, 0),
3939 aarch64_get_vec_float (cpu, vn, 1)));
3940 if (full)
3941 aarch64_set_vec_float (cpu, vd, 1,
3942 fn (aarch64_get_vec_float (cpu, vn, 2),
3943 aarch64_get_vec_float (cpu, vn, 3)));
3944
3945 aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
3946 fn (aarch64_get_vec_float (cpu, vm, 0),
3947 aarch64_get_vec_float (cpu, vm, 1)));
3948 if (full)
3949 aarch64_set_vec_float (cpu, vd, 3,
3950 fn (aarch64_get_vec_float (cpu, vm, 2),
3951 aarch64_get_vec_float (cpu, vm, 3)));
3952 }
3953 }
3954
3955 static void
3956 do_vec_AND (sim_cpu *cpu)
3957 {
3958 /* instr[31] = 0
3959 instr[30] = half (0)/full (1)
3960 instr[29,21] = 001110001
3961 instr[20,16] = Vm
3962 instr[15,10] = 000111
3963 instr[9,5] = Vn
3964     instr[4,0] = Vd. */
3965
3966 unsigned vm = INSTR (20, 16);
3967 unsigned vn = INSTR (9, 5);
3968 unsigned vd = INSTR (4, 0);
3969 unsigned i;
3970 int full = INSTR (30, 30);
3971
3972 NYI_assert (29, 21, 0x071);
3973 NYI_assert (15, 10, 0x07);
3974
3975 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3976 for (i = 0; i < (full ? 4 : 2); i++)
3977 aarch64_set_vec_u32 (cpu, vd, i,
3978 aarch64_get_vec_u32 (cpu, vn, i)
3979 & aarch64_get_vec_u32 (cpu, vm, i));
3980 }
3981
3982 static void
3983 do_vec_BSL (sim_cpu *cpu)
3984 {
3985 /* instr[31] = 0
3986 instr[30] = half (0)/full (1)
3987 instr[29,21] = 101110011
3988 instr[20,16] = Vm
3989 instr[15,10] = 000111
3990 instr[9,5] = Vn
3991     instr[4,0] = Vd. */
3992
3993 unsigned vm = INSTR (20, 16);
3994 unsigned vn = INSTR (9, 5);
3995 unsigned vd = INSTR (4, 0);
3996 unsigned i;
3997 int full = INSTR (30, 30);
3998
3999 NYI_assert (29, 21, 0x173);
4000 NYI_assert (15, 10, 0x07);
4001
4002 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
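  /* Bitwise select: each result bit comes from Vn where the
     corresponding Vd bit is set and from Vm where it is clear,
     i.e. (Vd & Vn) | (~Vd & Vm), computed a byte at a time.  */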
4003 for (i = 0; i < (full ? 16 : 8); i++)
4004 aarch64_set_vec_u8 (cpu, vd, i,
4005 ( aarch64_get_vec_u8 (cpu, vd, i)
4006 & aarch64_get_vec_u8 (cpu, vn, i))
4007 | ((~ aarch64_get_vec_u8 (cpu, vd, i))
4008 & aarch64_get_vec_u8 (cpu, vm, i)));
4009 }
4010
4011 static void
4012 do_vec_EOR (sim_cpu *cpu)
4013 {
4014 /* instr[31] = 0
4015 instr[30] = half (0)/full (1)
4016 instr[29,21] = 10 1110 001
4017 instr[20,16] = Vm
4018 instr[15,10] = 000111
4019 instr[9,5] = Vn
4020     instr[4,0] = Vd. */
4021
4022 unsigned vm = INSTR (20, 16);
4023 unsigned vn = INSTR (9, 5);
4024 unsigned vd = INSTR (4, 0);
4025 unsigned i;
4026 int full = INSTR (30, 30);
4027
4028 NYI_assert (29, 21, 0x171);
4029 NYI_assert (15, 10, 0x07);
4030
4031 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4032 for (i = 0; i < (full ? 4 : 2); i++)
4033 aarch64_set_vec_u32 (cpu, vd, i,
4034 aarch64_get_vec_u32 (cpu, vn, i)
4035 ^ aarch64_get_vec_u32 (cpu, vm, i));
4036 }
4037
4038 static void
4039 do_vec_bit (sim_cpu *cpu)
4040 {
4041 /* instr[31] = 0
4042 instr[30] = half (0)/full (1)
4043 instr[29,23] = 10 1110 1
4044 instr[22] = BIT (0) / BIF (1)
4045 instr[21] = 1
4046 instr[20,16] = Vm
4047 instr[15,10] = 0001 11
4048 instr[9,5] = Vn
4049     instr[4,0] = Vd. */
4050
4051 unsigned vm = INSTR (20, 16);
4052 unsigned vn = INSTR (9, 5);
4053 unsigned vd = INSTR (4, 0);
4054 unsigned full = INSTR (30, 30);
4055 unsigned test_false = INSTR (22, 22);
4056 unsigned i;
4057
4058 NYI_assert (29, 23, 0x5D);
4059 NYI_assert (21, 21, 1);
4060 NYI_assert (15, 10, 0x07);
4061
4062 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4063 for (i = 0; i < (full ? 4 : 2); i++)
4064 {
4065 uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4066 uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4067 uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4068 if (test_false)
4069 aarch64_set_vec_u32 (cpu, vd, i,
4070 (vd_val & vm_val) | (vn_val & ~vm_val));
4071 else
4072 aarch64_set_vec_u32 (cpu, vd, i,
4073 (vd_val & ~vm_val) | (vn_val & vm_val));
4074 }
4075 }
4076
4077 static void
4078 do_vec_ORN (sim_cpu *cpu)
4079 {
4080 /* instr[31] = 0
4081 instr[30] = half (0)/full (1)
4082 instr[29,21] = 00 1110 111
4083 instr[20,16] = Vm
4084 instr[15,10] = 00 0111
4085 instr[9,5] = Vn
4086     instr[4,0] = Vd. */
4087
4088 unsigned vm = INSTR (20, 16);
4089 unsigned vn = INSTR (9, 5);
4090 unsigned vd = INSTR (4, 0);
4091 unsigned i;
4092 int full = INSTR (30, 30);
4093
4094 NYI_assert (29, 21, 0x077);
4095 NYI_assert (15, 10, 0x07);
4096
4097 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4098 for (i = 0; i < (full ? 16 : 8); i++)
4099 aarch64_set_vec_u8 (cpu, vd, i,
4100 aarch64_get_vec_u8 (cpu, vn, i)
4101 | ~ aarch64_get_vec_u8 (cpu, vm, i));
4102 }
4103
4104 static void
4105 do_vec_ORR (sim_cpu *cpu)
4106 {
4107 /* instr[31] = 0
4108 instr[30] = half (0)/full (1)
4109 instr[29,21] = 00 1110 101
4110 instr[20,16] = Vm
4111 instr[15,10] = 0001 11
4112 instr[9,5] = Vn
4113     instr[4,0] = Vd. */
4114
4115 unsigned vm = INSTR (20, 16);
4116 unsigned vn = INSTR (9, 5);
4117 unsigned vd = INSTR (4, 0);
4118 unsigned i;
4119 int full = INSTR (30, 30);
4120
4121 NYI_assert (29, 21, 0x075);
4122 NYI_assert (15, 10, 0x07);
4123
4124 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4125 for (i = 0; i < (full ? 16 : 8); i++)
4126 aarch64_set_vec_u8 (cpu, vd, i,
4127 aarch64_get_vec_u8 (cpu, vn, i)
4128 | aarch64_get_vec_u8 (cpu, vm, i));
4129 }
4130
4131 static void
4132 do_vec_BIC (sim_cpu *cpu)
4133 {
4134 /* instr[31] = 0
4135 instr[30] = half (0)/full (1)
4136 instr[29,21] = 00 1110 011
4137 instr[20,16] = Vm
4138 instr[15,10] = 00 0111
4139 instr[9,5] = Vn
4140     instr[4,0] = Vd. */
4141
4142 unsigned vm = INSTR (20, 16);
4143 unsigned vn = INSTR (9, 5);
4144 unsigned vd = INSTR (4, 0);
4145 unsigned i;
4146 int full = INSTR (30, 30);
4147
4148 NYI_assert (29, 21, 0x073);
4149 NYI_assert (15, 10, 0x07);
4150
4151 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4152 for (i = 0; i < (full ? 16 : 8); i++)
4153 aarch64_set_vec_u8 (cpu, vd, i,
4154 aarch64_get_vec_u8 (cpu, vn, i)
4155 & ~ aarch64_get_vec_u8 (cpu, vm, i));
4156 }
4157
4158 static void
4159 do_vec_XTN (sim_cpu *cpu)
4160 {
4161 /* instr[31] = 0
4162 instr[30] = first part (0)/ second part (1)
4163 instr[29,24] = 00 1110
4164 instr[23,22] = size: byte(00), half(01), word (10)
4165 instr[21,10] = 1000 0100 1010
4166 instr[9,5] = Vs
4167 instr[4,0] = Vd. */
4168
4169 unsigned vs = INSTR (9, 5);
4170 unsigned vd = INSTR (4, 0);
4171 unsigned bias = INSTR (30, 30);
4172 unsigned i;
4173
4174 NYI_assert (29, 24, 0x0E);
4175 NYI_assert (21, 10, 0x84A);
4176
4177 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
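  /* Narrow each source element to half its width; XTN fills the
     lower half of Vd and XTN2 (bias set) the upper half.  */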
4178 switch (INSTR (23, 22))
4179 {
4180 case 0:
4181 for (i = 0; i < 8; i++)
4182 aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4183 aarch64_get_vec_u16 (cpu, vs, i));
4184 return;
4185
4186 case 1:
4187 for (i = 0; i < 4; i++)
4188 aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4189 aarch64_get_vec_u32 (cpu, vs, i));
4190 return;
4191
4192 case 2:
4193 for (i = 0; i < 2; i++)
4194 aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4195 aarch64_get_vec_u64 (cpu, vs, i));
4196 return;
4197 }
4198 }
4199
4200 /* Return the number of bits set in the input value. */
4201 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
4202 # define popcount __builtin_popcount
4203 #else
4204 static int
4205 popcount (unsigned char x)
4206 {
4207 static const unsigned char popcnt[16] =
4208 {
4209 0, 1, 1, 2,
4210 1, 2, 2, 3,
4211 1, 2, 2, 3,
4212 2, 3, 3, 4
4213 };
4214
4215 /* Only counts the low 8 bits of the input as that is all we need. */
4216 return popcnt[x % 16] + popcnt[x / 16];
4217 }
4218 #endif
4219
4220 static void
4221 do_vec_CNT (sim_cpu *cpu)
4222 {
4223 /* instr[31] = 0
4224 instr[30] = half (0)/ full (1)
4225 instr[29,24] = 00 1110
4226 instr[23,22] = size: byte(00)
4227 instr[21,10] = 1000 0001 0110
4228 instr[9,5] = Vs
4229 instr[4,0] = Vd. */
4230
4231 unsigned vs = INSTR (9, 5);
4232 unsigned vd = INSTR (4, 0);
4233 int full = INSTR (30, 30);
4234 int size = INSTR (23, 22);
4235 int i;
4236
4237 NYI_assert (29, 24, 0x0E);
4238 NYI_assert (21, 10, 0x816);
4239
4240 if (size != 0)
4241 HALT_UNALLOC;
4242
4243 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4244
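  /* Each destination byte receives the number of bits set in the
     corresponding source byte; CNT is only defined for bytes.  */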
4245 for (i = 0; i < (full ? 16 : 8); i++)
4246 aarch64_set_vec_u8 (cpu, vd, i,
4247 popcount (aarch64_get_vec_u8 (cpu, vs, i)));
4248 }
4249
4250 static void
4251 do_vec_maxv (sim_cpu *cpu)
4252 {
4253 /* instr[31] = 0
4254 instr[30] = half(0)/full(1)
4255 instr[29] = signed (0)/unsigned(1)
4256 instr[28,24] = 0 1110
4257 instr[23,22] = size: byte(00), half(01), word (10)
4258 instr[21] = 1
4259 instr[20,17] = 1 000
4260 instr[16] = max(0)/min(1)
4261 instr[15,10] = 1010 10
4262 instr[9,5] = V source
4263     instr[4,0] = R dest. */
4264
4265 unsigned vs = INSTR (9, 5);
4266 unsigned rd = INSTR (4, 0);
4267 unsigned full = INSTR (30, 30);
4268 unsigned i;
4269
4270 NYI_assert (28, 24, 0x0E);
4271 NYI_assert (21, 21, 1);
4272 NYI_assert (20, 17, 8);
4273 NYI_assert (15, 10, 0x2A);
4274
4275 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
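  /* Bit 29 selects signed (0) vs unsigned (1) and bit 16 selects
     max (0) vs min (1), giving SMAXV, SMINV, UMAXV and UMINV.  */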
4276 switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4277 {
4278 case 0: /* SMAXV. */
4279 {
4280 int64_t smax;
4281 switch (INSTR (23, 22))
4282 {
4283 case 0:
4284 smax = aarch64_get_vec_s8 (cpu, vs, 0);
4285 for (i = 1; i < (full ? 16 : 8); i++)
4286 smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4287 break;
4288 case 1:
4289 smax = aarch64_get_vec_s16 (cpu, vs, 0);
4290 for (i = 1; i < (full ? 8 : 4); i++)
4291 smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4292 break;
4293 case 2:
4294 smax = aarch64_get_vec_s32 (cpu, vs, 0);
4295 for (i = 1; i < (full ? 4 : 2); i++)
4296 smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4297 break;
4298 case 3:
4299 HALT_UNALLOC;
4300 }
4301 aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4302 return;
4303 }
4304
4305 case 1: /* SMINV. */
4306 {
4307 int64_t smin;
4308 switch (INSTR (23, 22))
4309 {
4310 case 0:
4311 smin = aarch64_get_vec_s8 (cpu, vs, 0);
4312 for (i = 1; i < (full ? 16 : 8); i++)
4313 smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4314 break;
4315 case 1:
4316 smin = aarch64_get_vec_s16 (cpu, vs, 0);
4317 for (i = 1; i < (full ? 8 : 4); i++)
4318 smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4319 break;
4320 case 2:
4321 smin = aarch64_get_vec_s32 (cpu, vs, 0);
4322 for (i = 1; i < (full ? 4 : 2); i++)
4323 smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4324 break;
4325
4326 case 3:
4327 HALT_UNALLOC;
4328 }
4329 aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4330 return;
4331 }
4332
4333 case 2: /* UMAXV. */
4334 {
4335 uint64_t umax;
4336 switch (INSTR (23, 22))
4337 {
4338 case 0:
4339 umax = aarch64_get_vec_u8 (cpu, vs, 0);
4340 for (i = 1; i < (full ? 16 : 8); i++)
4341 umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4342 break;
4343 case 1:
4344 umax = aarch64_get_vec_u16 (cpu, vs, 0);
4345 for (i = 1; i < (full ? 8 : 4); i++)
4346 umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4347 break;
4348 case 2:
4349 umax = aarch64_get_vec_u32 (cpu, vs, 0);
4350 for (i = 1; i < (full ? 4 : 2); i++)
4351 umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4352 break;
4353
4354 case 3:
4355 HALT_UNALLOC;
4356 }
4357 aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4358 return;
4359 }
4360
4361 case 3: /* UMINV. */
4362 {
4363 uint64_t umin;
4364 switch (INSTR (23, 22))
4365 {
4366 case 0:
4367 umin = aarch64_get_vec_u8 (cpu, vs, 0);
4368 for (i = 1; i < (full ? 16 : 8); i++)
4369 umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4370 break;
4371 case 1:
4372 umin = aarch64_get_vec_u16 (cpu, vs, 0);
4373 for (i = 1; i < (full ? 8 : 4); i++)
4374 umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4375 break;
4376 case 2:
4377 umin = aarch64_get_vec_u32 (cpu, vs, 0);
4378 for (i = 1; i < (full ? 4 : 2); i++)
4379 umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4380 break;
4381
4382 case 3:
4383 HALT_UNALLOC;
4384 }
4385 aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4386 return;
4387 }
4388 }
4389 }
4390
4391 static void
4392 do_vec_fminmaxV (sim_cpu *cpu)
4393 {
4394 /* instr[31,24] = 0110 1110
4395 instr[23] = max(0)/min(1)
4396 instr[22,14] = 011 0000 11
4397 instr[13,12] = nm(00)/normal(11)
4398 instr[11,10] = 10
4399 instr[9,5] = V source
4400     instr[4,0] = R dest. */
4401
4402 unsigned vs = INSTR (9, 5);
4403 unsigned rd = INSTR (4, 0);
4404 unsigned i;
4405 float res = aarch64_get_vec_float (cpu, vs, 0);
4406
4407 NYI_assert (31, 24, 0x6E);
4408 NYI_assert (22, 14, 0x0C3);
4409 NYI_assert (11, 10, 2);
4410
4411 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4412 if (INSTR (23, 23))
4413 {
4414 switch (INSTR (13, 12))
4415 {
4416 	case 0: /* FMINNMV. */
4417 for (i = 1; i < 4; i++)
4418 res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4419 break;
4420
4421 case 3: /* FMINV. */
4422 for (i = 1; i < 4; i++)
4423 res = min (res, aarch64_get_vec_float (cpu, vs, i));
4424 break;
4425
4426 default:
4427 HALT_NYI;
4428 }
4429 }
4430 else
4431 {
4432 switch (INSTR (13, 12))
4433 {
4434 	case 0: /* FMAXNMV. */
4435 for (i = 1; i < 4; i++)
4436 res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4437 break;
4438
4439 case 3: /* FMAXV. */
4440 for (i = 1; i < 4; i++)
4441 res = max (res, aarch64_get_vec_float (cpu, vs, i));
4442 break;
4443
4444 default:
4445 HALT_NYI;
4446 }
4447 }
4448
4449 aarch64_set_FP_float (cpu, rd, res);
4450 }
4451
4452 static void
4453 do_vec_Fminmax (sim_cpu *cpu)
4454 {
4455 /* instr[31] = 0
4456 instr[30] = half(0)/full(1)
4457 instr[29,24] = 00 1110
4458 instr[23] = max(0)/min(1)
4459 instr[22] = float(0)/double(1)
4460 instr[21] = 1
4461 instr[20,16] = Vm
4462 instr[15,14] = 11
4463 instr[13,12] = nm(00)/normal(11)
4464 instr[11,10] = 01
4465 instr[9,5] = Vn
4466 instr[4,0] = Vd. */
4467
4468 unsigned vm = INSTR (20, 16);
4469 unsigned vn = INSTR (9, 5);
4470 unsigned vd = INSTR (4, 0);
4471 unsigned full = INSTR (30, 30);
4472 unsigned min = INSTR (23, 23);
4473 unsigned i;
4474
4475 NYI_assert (29, 24, 0x0E);
4476 NYI_assert (21, 21, 1);
4477 NYI_assert (15, 14, 3);
4478 NYI_assert (11, 10, 1);
4479
4480 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4481 if (INSTR (22, 22))
4482 {
4483 double (* func)(double, double);
4484
4485 if (! full)
4486 HALT_NYI;
4487
4488 if (INSTR (13, 12) == 0)
4489 func = min ? dminnm : dmaxnm;
4490 else if (INSTR (13, 12) == 3)
4491 func = min ? fmin : fmax;
4492 else
4493 HALT_NYI;
4494
4495 for (i = 0; i < 2; i++)
4496 aarch64_set_vec_double (cpu, vd, i,
4497 func (aarch64_get_vec_double (cpu, vn, i),
4498 aarch64_get_vec_double (cpu, vm, i)));
4499 }
4500 else
4501 {
4502 float (* func)(float, float);
4503
4504 if (INSTR (13, 12) == 0)
4505 func = min ? fminnm : fmaxnm;
4506 else if (INSTR (13, 12) == 3)
4507 func = min ? fminf : fmaxf;
4508 else
4509 HALT_NYI;
4510
4511 for (i = 0; i < (full ? 4 : 2); i++)
4512 aarch64_set_vec_float (cpu, vd, i,
4513 func (aarch64_get_vec_float (cpu, vn, i),
4514 aarch64_get_vec_float (cpu, vm, i)));
4515 }
4516 }
4517
4518 static void
4519 do_vec_SCVTF (sim_cpu *cpu)
4520 {
4521 /* instr[31] = 0
4522 instr[30] = Q
4523 instr[29,23] = 00 1110 0
4524 instr[22] = float(0)/double(1)
4525 instr[21,10] = 10 0001 1101 10
4526 instr[9,5] = Vn
4527 instr[4,0] = Vd. */
4528
4529 unsigned vn = INSTR (9, 5);
4530 unsigned vd = INSTR (4, 0);
4531 unsigned full = INSTR (30, 30);
4532 unsigned size = INSTR (22, 22);
4533 unsigned i;
4534
4535 NYI_assert (29, 23, 0x1C);
4536 NYI_assert (21, 10, 0x876);
4537
4538 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4539 if (size)
4540 {
4541 if (! full)
4542 HALT_UNALLOC;
4543
4544 for (i = 0; i < 2; i++)
4545 {
4546 	  double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4547 aarch64_set_vec_double (cpu, vd, i, val);
4548 }
4549 }
4550 else
4551 {
4552 for (i = 0; i < (full ? 4 : 2); i++)
4553 {
4554 	  float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4555 aarch64_set_vec_float (cpu, vd, i, val);
4556 }
4557 }
4558 }
4559
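/* Element-wise integer comparison of Vn against Vm: each destination
   element is set to all-ones if the comparison is true and to zero
   otherwise.  */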
4560 #define VEC_CMP(SOURCE, CMP) \
4561 do \
4562 { \
4563 switch (size) \
4564 { \
4565 case 0: \
4566 for (i = 0; i < (full ? 16 : 8); i++) \
4567 aarch64_set_vec_u8 (cpu, vd, i, \
4568 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4569 CMP \
4570 aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4571 ? -1 : 0); \
4572 return; \
4573 case 1: \
4574 for (i = 0; i < (full ? 8 : 4); i++) \
4575 aarch64_set_vec_u16 (cpu, vd, i, \
4576 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4577 CMP \
4578 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4579 ? -1 : 0); \
4580 return; \
4581 case 2: \
4582 for (i = 0; i < (full ? 4 : 2); i++) \
4583 aarch64_set_vec_u32 (cpu, vd, i, \
4584 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4585 CMP \
4586 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4587 ? -1 : 0); \
4588 return; \
4589 case 3: \
4590 if (! full) \
4591 HALT_UNALLOC; \
4592 for (i = 0; i < 2; i++) \
4593 aarch64_set_vec_u64 (cpu, vd, i, \
4594 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4595 CMP \
4596 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4597 ? -1ULL : 0); \
4598 return; \
4599 } \
4600 } \
4601 while (0)
4602
4603 #define VEC_CMP0(SOURCE, CMP) \
4604 do \
4605 { \
4606 switch (size) \
4607 { \
4608 case 0: \
4609 for (i = 0; i < (full ? 16 : 8); i++) \
4610 aarch64_set_vec_u8 (cpu, vd, i, \
4611 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4612 CMP 0 ? -1 : 0); \
4613 return; \
4614 case 1: \
4615 for (i = 0; i < (full ? 8 : 4); i++) \
4616 aarch64_set_vec_u16 (cpu, vd, i, \
4617 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4618 CMP 0 ? -1 : 0); \
4619 return; \
4620 case 2: \
4621 for (i = 0; i < (full ? 4 : 2); i++) \
4622 aarch64_set_vec_u32 (cpu, vd, i, \
4623 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4624 CMP 0 ? -1 : 0); \
4625 return; \
4626 case 3: \
4627 if (! full) \
4628 HALT_UNALLOC; \
4629 for (i = 0; i < 2; i++) \
4630 aarch64_set_vec_u64 (cpu, vd, i, \
4631 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4632 CMP 0 ? -1ULL : 0); \
4633 return; \
4634 } \
4635 } \
4636 while (0)
4637
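/* Floating point comparison of each element of Vn against zero; the
   Vm field must be zero for these encodings.  */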
4638 #define VEC_FCMP0(CMP) \
4639 do \
4640 { \
4641 if (vm != 0) \
4642 HALT_NYI; \
4643 if (INSTR (22, 22)) \
4644 { \
4645 if (! full) \
4646 HALT_NYI; \
4647 for (i = 0; i < 2; i++) \
4648 aarch64_set_vec_u64 (cpu, vd, i, \
4649 aarch64_get_vec_double (cpu, vn, i) \
4650 CMP 0.0 ? -1 : 0); \
4651 } \
4652 else \
4653 { \
4654 for (i = 0; i < (full ? 4 : 2); i++) \
4655 aarch64_set_vec_u32 (cpu, vd, i, \
4656 aarch64_get_vec_float (cpu, vn, i) \
4657 CMP 0.0 ? -1 : 0); \
4658 } \
4659 return; \
4660 } \
4661 while (0)
4662
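/* Floating point comparison of corresponding elements of Vn and Vm,
   writing all-ones to the destination element on success and zero on
   failure.  */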
4663 #define VEC_FCMP(CMP) \
4664 do \
4665 { \
4666 if (INSTR (22, 22)) \
4667 { \
4668 if (! full) \
4669 HALT_NYI; \
4670 for (i = 0; i < 2; i++) \
4671 aarch64_set_vec_u64 (cpu, vd, i, \
4672 aarch64_get_vec_double (cpu, vn, i) \
4673 CMP \
4674 aarch64_get_vec_double (cpu, vm, i) \
4675 ? -1 : 0); \
4676 } \
4677 else \
4678 { \
4679 for (i = 0; i < (full ? 4 : 2); i++) \
4680 aarch64_set_vec_u32 (cpu, vd, i, \
4681 aarch64_get_vec_float (cpu, vn, i) \
4682 CMP \
4683 aarch64_get_vec_float (cpu, vm, i) \
4684 ? -1 : 0); \
4685 } \
4686 return; \
4687 } \
4688 while (0)
4689
4690 static void
4691 do_vec_compare (sim_cpu *cpu)
4692 {
4693 /* instr[31] = 0
4694 instr[30] = half(0)/full(1)
4695 instr[29] = part-of-comparison-type
4696 instr[28,24] = 0 1110
4697 instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4698 type of float compares: single (-0) / double (-1)
4699 instr[21] = 1
4700 instr[20,16] = Vm or 00000 (compare vs 0)
4701 instr[15,10] = part-of-comparison-type
4702 instr[9,5] = Vn
4703     instr[4,0] = Vd. */
4704
4705 int full = INSTR (30, 30);
4706 int size = INSTR (23, 22);
4707 unsigned vm = INSTR (20, 16);
4708 unsigned vn = INSTR (9, 5);
4709 unsigned vd = INSTR (4, 0);
4710 unsigned i;
4711
4712 NYI_assert (28, 24, 0x0E);
4713 NYI_assert (21, 21, 1);
4714
4715 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4716 if ((INSTR (11, 11)
4717 && INSTR (14, 14))
4718 || ((INSTR (11, 11) == 0
4719 && INSTR (10, 10) == 0)))
4720 {
4721 /* A compare vs 0. */
4722 if (vm != 0)
4723 {
4724 if (INSTR (15, 10) == 0x2A)
4725 do_vec_maxv (cpu);
4726 else if (INSTR (15, 10) == 0x32
4727 || INSTR (15, 10) == 0x3E)
4728 do_vec_fminmaxV (cpu);
4729 else if (INSTR (29, 23) == 0x1C
4730 && INSTR (21, 10) == 0x876)
4731 do_vec_SCVTF (cpu);
4732 else
4733 HALT_NYI;
4734 return;
4735 }
4736 }
4737
4738 if (INSTR (14, 14))
4739 {
4740 /* A floating point compare. */
4741 unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4742 | INSTR (13, 10);
4743
4744 NYI_assert (15, 15, 1);
4745
4746 switch (decode)
4747 {
4748 case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4749 case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4750 case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4751 case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4752 case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4753 case /* 0b111001: GT */ 0x39: VEC_FCMP (>);
4754 case /* 0b101001: GE */ 0x29: VEC_FCMP (>=);
4755 case /* 0b001001: EQ */ 0x09: VEC_FCMP (==);
4756
4757 default:
4758 HALT_NYI;
4759 }
4760 }
4761 else
4762 {
4763 unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4764
4765 switch (decode)
4766 {
4767 case 0x0D: /* 0001101 GT */ VEC_CMP (s, > );
4768 case 0x0F: /* 0001111 GE */ VEC_CMP (s, >= );
4769 case 0x22: /* 0100010 GT #0 */ VEC_CMP0 (s, > );
4770 case 0x23: /* 0100011 TST */ VEC_CMP (u, & );
4771 case 0x26: /* 0100110 EQ #0 */ VEC_CMP0 (s, == );
4772 case 0x2A: /* 0101010 LT #0 */ VEC_CMP0 (s, < );
4773 case 0x4D: /* 1001101 HI */ VEC_CMP (u, > );
4774 case 0x4F: /* 1001111 HS */ VEC_CMP (u, >= );
4775 case 0x62: /* 1100010 GE #0 */ VEC_CMP0 (s, >= );
4776 case 0x63: /* 1100011 EQ */ VEC_CMP (u, == );
4777 case 0x66: /* 1100110 LE #0 */ VEC_CMP0 (s, <= );
4778 default:
4779 if (vm == 0)
4780 HALT_NYI;
4781 do_vec_maxv (cpu);
4782 }
4783 }
4784 }
4785
4786 static void
4787 do_vec_SSHL (sim_cpu *cpu)
4788 {
4789 /* instr[31] = 0
4790 instr[30] = first part (0)/ second part (1)
4791 instr[29,24] = 00 1110
4792 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4793 instr[21] = 1
4794 instr[20,16] = Vm
4795 instr[15,10] = 0100 01
4796 instr[9,5] = Vn
4797 instr[4,0] = Vd. */
4798
4799 unsigned full = INSTR (30, 30);
4800 unsigned vm = INSTR (20, 16);
4801 unsigned vn = INSTR (9, 5);
4802 unsigned vd = INSTR (4, 0);
4803 unsigned i;
4804 signed int shift;
4805
4806 NYI_assert (29, 24, 0x0E);
4807 NYI_assert (21, 21, 1);
4808 NYI_assert (15, 10, 0x11);
4809
4810   /* FIXME: What is a signed shift left in this context?  */
4811
4812 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
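  /* Each element of Vn is shifted by the signed count in the low byte
     of the corresponding Vm element: left for a positive count,
     arithmetic right for a negative one.  */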
4813 switch (INSTR (23, 22))
4814 {
4815 case 0:
4816 for (i = 0; i < (full ? 16 : 8); i++)
4817 {
4818 shift = aarch64_get_vec_s8 (cpu, vm, i);
4819 if (shift >= 0)
4820 aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4821 << shift);
4822 else
4823 aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4824 >> - shift);
4825 }
4826 return;
4827
4828 case 1:
4829 for (i = 0; i < (full ? 8 : 4); i++)
4830 {
4831 shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4832 if (shift >= 0)
4833 aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4834 << shift);
4835 else
4836 aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4837 >> - shift);
4838 }
4839 return;
4840
4841 case 2:
4842 for (i = 0; i < (full ? 4 : 2); i++)
4843 {
4844 shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4845 if (shift >= 0)
4846 aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4847 << shift);
4848 else
4849 aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4850 >> - shift);
4851 }
4852 return;
4853
4854 case 3:
4855 if (! full)
4856 HALT_UNALLOC;
4857 for (i = 0; i < 2; i++)
4858 {
4859 shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4860 if (shift >= 0)
4861 aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4862 << shift);
4863 else
4864 aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4865 >> - shift);
4866 }
4867 return;
4868 }
4869 }
4870
4871 static void
4872 do_vec_USHL (sim_cpu *cpu)
4873 {
4874 /* instr[31] = 0
4875 instr[30] = first part (0)/ second part (1)
4876 instr[29,24] = 10 1110
4877 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4878 instr[21] = 1
4879 instr[20,16] = Vm
4880 instr[15,10] = 0100 01
4881 instr[9,5] = Vn
4882 instr[4,0] = Vd */
4883
4884 unsigned full = INSTR (30, 30);
4885 unsigned vm = INSTR (20, 16);
4886 unsigned vn = INSTR (9, 5);
4887 unsigned vd = INSTR (4, 0);
4888 unsigned i;
4889 signed int shift;
4890
4891 NYI_assert (29, 24, 0x2E);
4892 NYI_assert (15, 10, 0x11);
4893
4894 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
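  /* As for SSHL, except that a negative count produces a logical
     (zero filling) right shift, the elements being unsigned.  */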
4895 switch (INSTR (23, 22))
4896 {
4897 case 0:
4898 for (i = 0; i < (full ? 16 : 8); i++)
4899 {
4900 shift = aarch64_get_vec_s8 (cpu, vm, i);
4901 if (shift >= 0)
4902 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4903 << shift);
4904 else
4905 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4906 >> - shift);
4907 }
4908 return;
4909
4910 case 1:
4911 for (i = 0; i < (full ? 8 : 4); i++)
4912 {
4913 shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4914 if (shift >= 0)
4915 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4916 << shift);
4917 else
4918 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4919 >> - shift);
4920 }
4921 return;
4922
4923 case 2:
4924 for (i = 0; i < (full ? 4 : 2); i++)
4925 {
4926 shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4927 if (shift >= 0)
4928 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4929 << shift);
4930 else
4931 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4932 >> - shift);
4933 }
4934 return;
4935
4936 case 3:
4937 if (! full)
4938 HALT_UNALLOC;
4939 for (i = 0; i < 2; i++)
4940 {
4941 shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4942 if (shift >= 0)
4943 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4944 << shift);
4945 else
4946 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4947 >> - shift);
4948 }
4949 return;
4950 }
4951 }
4952
4953 static void
4954 do_vec_FMLA (sim_cpu *cpu)
4955 {
4956 /* instr[31] = 0
4957 instr[30] = full/half selector
4958 instr[29,23] = 0011100
4959 instr[22] = size: 0=>float, 1=>double
4960 instr[21] = 1
4961     instr[20,16] = Vm
4962     instr[15,10] = 1100 11
4963     instr[9,5] = Vn
4964     instr[4,0] = Vd. */
4965
4966 unsigned vm = INSTR (20, 16);
4967 unsigned vn = INSTR (9, 5);
4968 unsigned vd = INSTR (4, 0);
4969 unsigned i;
4970 int full = INSTR (30, 30);
4971
4972 NYI_assert (29, 23, 0x1C);
4973 NYI_assert (21, 21, 1);
4974 NYI_assert (15, 10, 0x33);
4975
4976 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4977 if (INSTR (22, 22))
4978 {
4979 if (! full)
4980 HALT_UNALLOC;
4981 for (i = 0; i < 2; i++)
4982 aarch64_set_vec_double (cpu, vd, i,
4983 aarch64_get_vec_double (cpu, vn, i) *
4984 aarch64_get_vec_double (cpu, vm, i) +
4985 aarch64_get_vec_double (cpu, vd, i));
4986 }
4987 else
4988 {
4989 for (i = 0; i < (full ? 4 : 2); i++)
4990 aarch64_set_vec_float (cpu, vd, i,
4991 aarch64_get_vec_float (cpu, vn, i) *
4992 aarch64_get_vec_float (cpu, vm, i) +
4993 aarch64_get_vec_float (cpu, vd, i));
4994 }
4995 }
4996
4997 static void
4998 do_vec_max (sim_cpu *cpu)
4999 {
5000 /* instr[31] = 0
5001 instr[30] = full/half selector
5002 instr[29] = SMAX (0) / UMAX (1)
5003 instr[28,24] = 0 1110
5004 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5005 instr[21] = 1
5006     instr[20,16] = Vm
5007     instr[15,10] = 0110 01
5008     instr[9,5] = Vn
5009     instr[4,0] = Vd. */
5010
5011 unsigned vm = INSTR (20, 16);
5012 unsigned vn = INSTR (9, 5);
5013 unsigned vd = INSTR (4, 0);
5014 unsigned i;
5015 int full = INSTR (30, 30);
5016
5017 NYI_assert (28, 24, 0x0E);
5018 NYI_assert (21, 21, 1);
5019 NYI_assert (15, 10, 0x19);
5020
5021 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5022 if (INSTR (29, 29))
5023 {
5024 switch (INSTR (23, 22))
5025 {
5026 case 0:
5027 for (i = 0; i < (full ? 16 : 8); i++)
5028 aarch64_set_vec_u8 (cpu, vd, i,
5029 aarch64_get_vec_u8 (cpu, vn, i)
5030 > aarch64_get_vec_u8 (cpu, vm, i)
5031 ? aarch64_get_vec_u8 (cpu, vn, i)
5032 : aarch64_get_vec_u8 (cpu, vm, i));
5033 return;
5034
5035 case 1:
5036 for (i = 0; i < (full ? 8 : 4); i++)
5037 aarch64_set_vec_u16 (cpu, vd, i,
5038 aarch64_get_vec_u16 (cpu, vn, i)
5039 > aarch64_get_vec_u16 (cpu, vm, i)
5040 ? aarch64_get_vec_u16 (cpu, vn, i)
5041 : aarch64_get_vec_u16 (cpu, vm, i));
5042 return;
5043
5044 case 2:
5045 for (i = 0; i < (full ? 4 : 2); i++)
5046 aarch64_set_vec_u32 (cpu, vd, i,
5047 aarch64_get_vec_u32 (cpu, vn, i)
5048 > aarch64_get_vec_u32 (cpu, vm, i)
5049 ? aarch64_get_vec_u32 (cpu, vn, i)
5050 : aarch64_get_vec_u32 (cpu, vm, i));
5051 return;
5052
5053 case 3:
5054 HALT_UNALLOC;
5055 }
5056 }
5057 else
5058 {
5059 switch (INSTR (23, 22))
5060 {
5061 case 0:
5062 for (i = 0; i < (full ? 16 : 8); i++)
5063 aarch64_set_vec_s8 (cpu, vd, i,
5064 aarch64_get_vec_s8 (cpu, vn, i)
5065 > aarch64_get_vec_s8 (cpu, vm, i)
5066 ? aarch64_get_vec_s8 (cpu, vn, i)
5067 : aarch64_get_vec_s8 (cpu, vm, i));
5068 return;
5069
5070 case 1:
5071 for (i = 0; i < (full ? 8 : 4); i++)
5072 aarch64_set_vec_s16 (cpu, vd, i,
5073 aarch64_get_vec_s16 (cpu, vn, i)
5074 > aarch64_get_vec_s16 (cpu, vm, i)
5075 ? aarch64_get_vec_s16 (cpu, vn, i)
5076 : aarch64_get_vec_s16 (cpu, vm, i));
5077 return;
5078
5079 case 2:
5080 for (i = 0; i < (full ? 4 : 2); i++)
5081 aarch64_set_vec_s32 (cpu, vd, i,
5082 aarch64_get_vec_s32 (cpu, vn, i)
5083 > aarch64_get_vec_s32 (cpu, vm, i)
5084 ? aarch64_get_vec_s32 (cpu, vn, i)
5085 : aarch64_get_vec_s32 (cpu, vm, i));
5086 return;
5087
5088 case 3:
5089 HALT_UNALLOC;
5090 }
5091 }
5092 }
5093
5094 static void
5095 do_vec_min (sim_cpu *cpu)
5096 {
5097 /* instr[31] = 0
5098 instr[30] = full/half selector
5099 instr[29] = SMIN (0) / UMIN (1)
5100 instr[28,24] = 0 1110
5101 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5102 instr[21] = 1
5103     instr[20,16] = Vm
5104     instr[15,10] = 0110 11
5105     instr[9,5] = Vn
5106     instr[4,0] = Vd. */
5107
5108 unsigned vm = INSTR (20, 16);
5109 unsigned vn = INSTR (9, 5);
5110 unsigned vd = INSTR (4, 0);
5111 unsigned i;
5112 int full = INSTR (30, 30);
5113
5114 NYI_assert (28, 24, 0x0E);
5115 NYI_assert (21, 21, 1);
5116 NYI_assert (15, 10, 0x1B);
5117
5118 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5119 if (INSTR (29, 29))
5120 {
5121 switch (INSTR (23, 22))
5122 {
5123 case 0:
5124 for (i = 0; i < (full ? 16 : 8); i++)
5125 aarch64_set_vec_u8 (cpu, vd, i,
5126 aarch64_get_vec_u8 (cpu, vn, i)
5127 < aarch64_get_vec_u8 (cpu, vm, i)
5128 ? aarch64_get_vec_u8 (cpu, vn, i)
5129 : aarch64_get_vec_u8 (cpu, vm, i));
5130 return;
5131
5132 case 1:
5133 for (i = 0; i < (full ? 8 : 4); i++)
5134 aarch64_set_vec_u16 (cpu, vd, i,
5135 aarch64_get_vec_u16 (cpu, vn, i)
5136 < aarch64_get_vec_u16 (cpu, vm, i)
5137 ? aarch64_get_vec_u16 (cpu, vn, i)
5138 : aarch64_get_vec_u16 (cpu, vm, i));
5139 return;
5140
5141 case 2:
5142 for (i = 0; i < (full ? 4 : 2); i++)
5143 aarch64_set_vec_u32 (cpu, vd, i,
5144 aarch64_get_vec_u32 (cpu, vn, i)
5145 < aarch64_get_vec_u32 (cpu, vm, i)
5146 ? aarch64_get_vec_u32 (cpu, vn, i)
5147 : aarch64_get_vec_u32 (cpu, vm, i));
5148 return;
5149
5150 case 3:
5151 HALT_UNALLOC;
5152 }
5153 }
5154 else
5155 {
5156 switch (INSTR (23, 22))
5157 {
5158 case 0:
5159 for (i = 0; i < (full ? 16 : 8); i++)
5160 aarch64_set_vec_s8 (cpu, vd, i,
5161 aarch64_get_vec_s8 (cpu, vn, i)
5162 < aarch64_get_vec_s8 (cpu, vm, i)
5163 ? aarch64_get_vec_s8 (cpu, vn, i)
5164 : aarch64_get_vec_s8 (cpu, vm, i));
5165 return;
5166
5167 case 1:
5168 for (i = 0; i < (full ? 8 : 4); i++)
5169 aarch64_set_vec_s16 (cpu, vd, i,
5170 aarch64_get_vec_s16 (cpu, vn, i)
5171 < aarch64_get_vec_s16 (cpu, vm, i)
5172 ? aarch64_get_vec_s16 (cpu, vn, i)
5173 : aarch64_get_vec_s16 (cpu, vm, i));
5174 return;
5175
5176 case 2:
5177 for (i = 0; i < (full ? 4 : 2); i++)
5178 aarch64_set_vec_s32 (cpu, vd, i,
5179 aarch64_get_vec_s32 (cpu, vn, i)
5180 < aarch64_get_vec_s32 (cpu, vm, i)
5181 ? aarch64_get_vec_s32 (cpu, vn, i)
5182 : aarch64_get_vec_s32 (cpu, vm, i));
5183 return;
5184
5185 case 3:
5186 HALT_UNALLOC;
5187 }
5188 }
5189 }
5190
5191 static void
5192 do_vec_sub_long (sim_cpu *cpu)
5193 {
5194 /* instr[31] = 0
5195 instr[30] = lower (0) / upper (1)
5196 instr[29] = signed (0) / unsigned (1)
5197 instr[28,24] = 0 1110
5198 instr[23,22] = size: bytes (00), half (01), word (10)
5199 instr[21] = 1
5200     instr[20,16] = Vm
5201 instr[15,10] = 0010 00
5202 instr[9,5] = Vn
5203 instr[4,0] = V dest. */
5204
5205 unsigned size = INSTR (23, 22);
5206 unsigned vm = INSTR (20, 16);
5207 unsigned vn = INSTR (9, 5);
5208 unsigned vd = INSTR (4, 0);
5209 unsigned bias = 0;
5210 unsigned i;
5211
5212 NYI_assert (28, 24, 0x0E);
5213 NYI_assert (21, 21, 1);
5214 NYI_assert (15, 10, 0x08);
5215
5216 if (size == 3)
5217 HALT_UNALLOC;
5218
5219 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5220 switch (INSTR (30, 29))
5221 {
5222 case 2: /* SSUBL2. */
5223       bias = 2; /* Fall through.  */
5224 case 0: /* SSUBL. */
5225 switch (size)
5226 {
5227 case 0:
5228 	  bias *= 4; /* The upper half starts at byte element 8.  */
5229 for (i = 0; i < 8; i++)
5230 aarch64_set_vec_s16 (cpu, vd, i,
5231 aarch64_get_vec_s8 (cpu, vn, i + bias)
5232 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5233 break;
5234
5235 case 1:
5236 bias *= 2;
5237 for (i = 0; i < 4; i++)
5238 aarch64_set_vec_s32 (cpu, vd, i,
5239 aarch64_get_vec_s16 (cpu, vn, i + bias)
5240 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5241 break;
5242
5243 case 2:
5244 for (i = 0; i < 2; i++)
5245 aarch64_set_vec_s64 (cpu, vd, i,
5246 aarch64_get_vec_s32 (cpu, vn, i + bias)
5247 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5248 break;
5249
5250 default:
5251 HALT_UNALLOC;
5252 }
5253 break;
5254
5255 case 3: /* USUBL2. */
5256       bias = 2; /* Fall through.  */
5257 case 1: /* USUBL. */
5258 switch (size)
5259 {
5260 case 0:
5261 	  bias *= 4; /* The upper half starts at byte element 8.  */
5262 for (i = 0; i < 8; i++)
5263 aarch64_set_vec_u16 (cpu, vd, i,
5264 aarch64_get_vec_u8 (cpu, vn, i + bias)
5265 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5266 break;
5267
5268 case 1:
5269 bias *= 2;
5270 for (i = 0; i < 4; i++)
5271 aarch64_set_vec_u32 (cpu, vd, i,
5272 aarch64_get_vec_u16 (cpu, vn, i + bias)
5273 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5274 break;
5275
5276 case 2:
5277 for (i = 0; i < 2; i++)
5278 aarch64_set_vec_u64 (cpu, vd, i,
5279 aarch64_get_vec_u32 (cpu, vn, i + bias)
5280 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5281 break;
5282
5283 default:
5284 HALT_UNALLOC;
5285 }
5286 break;
5287 }
5288 }
5289
5290 static void
5291 do_vec_ADDP (sim_cpu *cpu)
5292 {
5293 /* instr[31] = 0
5294 instr[30] = half(0)/full(1)
5295 instr[29,24] = 00 1110
5296 instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5297 instr[21] = 1
5298     instr[20,16] = Vm
5299 instr[15,10] = 1011 11
5300 instr[9,5] = Vn
5301 instr[4,0] = V dest. */
5302
5303 FRegister copy_vn;
5304 FRegister copy_vm;
5305 unsigned full = INSTR (30, 30);
5306 unsigned size = INSTR (23, 22);
5307 unsigned vm = INSTR (20, 16);
5308 unsigned vn = INSTR (9, 5);
5309 unsigned vd = INSTR (4, 0);
5310 unsigned i, range;
5311
5312 NYI_assert (29, 24, 0x0E);
5313 NYI_assert (21, 21, 1);
5314 NYI_assert (15, 10, 0x2F);
5315
5316 /* Make copies of the source registers in case vd == vn/vm. */
5317 copy_vn = cpu->fr[vn];
5318 copy_vm = cpu->fr[vm];
5319
5320 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
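  /* Pairwise add: sums of adjacent Vn elements fill the lower half
     of Vd and sums of adjacent Vm elements the upper half.  */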
5321 switch (size)
5322 {
5323 case 0:
5324 range = full ? 8 : 4;
5325 for (i = 0; i < range; i++)
5326 {
5327 aarch64_set_vec_u8 (cpu, vd, i,
5328 copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5329 aarch64_set_vec_u8 (cpu, vd, i + range,
5330 copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5331 }
5332 return;
5333
5334 case 1:
5335 range = full ? 4 : 2;
5336 for (i = 0; i < range; i++)
5337 {
5338 aarch64_set_vec_u16 (cpu, vd, i,
5339 copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5340 aarch64_set_vec_u16 (cpu, vd, i + range,
5341 copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5342 }
5343 return;
5344
5345 case 2:
5346 range = full ? 2 : 1;
5347 for (i = 0; i < range; i++)
5348 {
5349 aarch64_set_vec_u32 (cpu, vd, i,
5350 copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5351 aarch64_set_vec_u32 (cpu, vd, i + range,
5352 copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5353 }
5354 return;
5355
5356 case 3:
5357 if (! full)
5358 HALT_UNALLOC;
5359 aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5360 aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5361 return;
5362 }
5363 }
5364
5365 static void
5366 do_vec_UMOV (sim_cpu *cpu)
5367 {
5368 /* instr[31] = 0
5369 instr[30] = 32-bit(0)/64-bit(1)
5370 instr[29,21] = 00 1110 000
5371     instr[20,16] = size & index
5372 instr[15,10] = 0011 11
5373 instr[9,5] = V source
5374 instr[4,0] = R dest. */
5375
5376 unsigned vs = INSTR (9, 5);
5377 unsigned rd = INSTR (4, 0);
5378 unsigned index;
5379
5380 NYI_assert (29, 21, 0x070);
5381 NYI_assert (15, 10, 0x0F);
5382
5383 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
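  /* The position of the lowest set bit in instr[20,16] selects the
     element size; the bits above it hold the element index.  */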
5384 if (INSTR (16, 16))
5385 {
5386 /* Byte transfer. */
5387 index = INSTR (20, 17);
5388 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5389 aarch64_get_vec_u8 (cpu, vs, index));
5390 }
5391 else if (INSTR (17, 17))
5392 {
5393 index = INSTR (20, 18);
5394 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5395 aarch64_get_vec_u16 (cpu, vs, index));
5396 }
5397 else if (INSTR (18, 18))
5398 {
5399 index = INSTR (20, 19);
5400 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5401 aarch64_get_vec_u32 (cpu, vs, index));
5402 }
5403 else
5404 {
5405 if (INSTR (30, 30) != 1)
5406 HALT_UNALLOC;
5407
5408 index = INSTR (20, 20);
5409 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5410 aarch64_get_vec_u64 (cpu, vs, index));
5411 }
5412 }
5413
5414 static void
5415 do_vec_FABS (sim_cpu *cpu)
5416 {
5417 /* instr[31] = 0
5418 instr[30] = half(0)/full(1)
5419 instr[29,23] = 00 1110 1
5420 instr[22] = float(0)/double(1)
5421 instr[21,16] = 10 0000
5422 instr[15,10] = 1111 10
5423 instr[9,5] = Vn
5424 instr[4,0] = Vd. */
5425
5426 unsigned vn = INSTR (9, 5);
5427 unsigned vd = INSTR (4, 0);
5428 unsigned full = INSTR (30, 30);
5429 unsigned i;
5430
5431 NYI_assert (29, 23, 0x1D);
5432 NYI_assert (21, 10, 0x83E);
5433
5434 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5435 if (INSTR (22, 22))
5436 {
5437 if (! full)
5438 HALT_NYI;
5439
5440 for (i = 0; i < 2; i++)
5441 aarch64_set_vec_double (cpu, vd, i,
5442 fabs (aarch64_get_vec_double (cpu, vn, i)));
5443 }
5444 else
5445 {
5446 for (i = 0; i < (full ? 4 : 2); i++)
5447 aarch64_set_vec_float (cpu, vd, i,
5448 fabsf (aarch64_get_vec_float (cpu, vn, i)));
5449 }
5450 }
5451
5452 static void
5453 do_vec_FCVTZS (sim_cpu *cpu)
5454 {
5455 /* instr[31] = 0
5456 instr[30] = half (0) / all (1)
5457 instr[29,23] = 00 1110 1
5458 instr[22] = single (0) / double (1)
5459 instr[21,10] = 10 0001 1011 10
5460 instr[9,5] = Rn
5461 instr[4,0] = Rd. */
5462
5463 unsigned rn = INSTR (9, 5);
5464 unsigned rd = INSTR (4, 0);
5465 unsigned full = INSTR (30, 30);
5466 unsigned i;
5467
5468 NYI_assert (31, 31, 0);
5469 NYI_assert (29, 23, 0x1D);
5470 NYI_assert (21, 10, 0x86E);
5471
5472 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5473 if (INSTR (22, 22))
5474 {
5475 if (! full)
5476 HALT_UNALLOC;
5477
5478 for (i = 0; i < 2; i++)
5479 aarch64_set_vec_s64 (cpu, rd, i,
5480 (int64_t) aarch64_get_vec_double (cpu, rn, i));
5481 }
5482 else
5483 for (i = 0; i < (full ? 4 : 2); i++)
5484 aarch64_set_vec_s32 (cpu, rd, i,
5485 (int32_t) aarch64_get_vec_float (cpu, rn, i));
5486 }
5487
5488 static void
5489 do_vec_REV64 (sim_cpu *cpu)
5490 {
5491 /* instr[31] = 0
5492 instr[30] = full/half
5493 instr[29,24] = 00 1110
5494 instr[23,22] = size
5495 instr[21,10] = 10 0000 0000 10
5496 instr[9,5] = Rn
5497 instr[4,0] = Rd. */
5498
5499 unsigned rn = INSTR (9, 5);
5500 unsigned rd = INSTR (4, 0);
5501 unsigned size = INSTR (23, 22);
5502 unsigned full = INSTR (30, 30);
5503 unsigned i;
5504 FRegister val;
5505
5506 NYI_assert (29, 24, 0x0E);
5507 NYI_assert (21, 10, 0x802);
5508
5509 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
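  /* Reversing the elements within each 64-bit doubleword is done by
     XOR-ing the element index: with 7 for bytes, 3 for halfwords and
     1 for words.  */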
5510 switch (size)
5511 {
5512 case 0:
5513 for (i = 0; i < (full ? 16 : 8); i++)
5514 val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5515 break;
5516
5517 case 1:
5518 for (i = 0; i < (full ? 8 : 4); i++)
5519 val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5520 break;
5521
5522 case 2:
5523 for (i = 0; i < (full ? 4 : 2); i++)
5524 val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5525 break;
5526
5527 case 3:
5528 HALT_UNALLOC;
5529 }
5530
5531 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5532 if (full)
5533 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5534 }
5535
5536 static void
5537 do_vec_REV16 (sim_cpu *cpu)
5538 {
5539 /* instr[31] = 0
5540 instr[30] = full/half
5541 instr[29,24] = 00 1110
5542 instr[23,22] = size
5543 instr[21,10] = 10 0000 0001 10
5544 instr[9,5] = Rn
5545 instr[4,0] = Rd. */
5546
5547 unsigned rn = INSTR (9, 5);
5548 unsigned rd = INSTR (4, 0);
5549 unsigned size = INSTR (23, 22);
5550 unsigned full = INSTR (30, 30);
5551 unsigned i;
5552 FRegister val;
5553
5554 NYI_assert (29, 24, 0x0E);
5555 NYI_assert (21, 10, 0x806);
5556
5557 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5558 switch (size)
5559 {
5560 case 0:
5561 for (i = 0; i < (full ? 16 : 8); i++)
5562 val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5563 break;
5564
5565 default:
5566 HALT_UNALLOC;
5567 }
5568
5569 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5570 if (full)
5571 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5572 }
5573
5574 static void
5575 do_vec_op1 (sim_cpu *cpu)
5576 {
5577 /* instr[31] = 0
5578 instr[30] = half/full
5579 instr[29,24] = 00 1110
5580 instr[23,21] = ???
5581 instr[20,16] = Vm
5582 instr[15,10] = sub-opcode
5583 instr[9,5] = Vn
5584 instr[4,0] = Vd */
5585 NYI_assert (29, 24, 0x0E);
5586
5587 if (INSTR (21, 21) == 0)
5588 {
5589 if (INSTR (23, 22) == 0)
5590 {
5591 if (INSTR (30, 30) == 1
5592 && INSTR (17, 14) == 0
5593 && INSTR (12, 10) == 7)
5594 return do_vec_ins_2 (cpu);
5595
5596 switch (INSTR (15, 10))
5597 {
5598 case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5599 case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5600 case 0x07: do_vec_INS (cpu); return;
5601 case 0x0A: do_vec_TRN (cpu); return;
5602
5603 case 0x0F:
5604 if (INSTR (17, 16) == 0)
5605 {
5606 do_vec_MOV_into_scalar (cpu);
5607 return;
5608 }
5609 break;
5610
5611 case 0x00:
5612 case 0x08:
5613 case 0x10:
5614 case 0x18:
5615 do_vec_TBL (cpu); return;
5616
5617 case 0x06:
5618 case 0x16:
5619 do_vec_UZP (cpu); return;
5620
5621 case 0x0E:
5622 case 0x1E:
5623 do_vec_ZIP (cpu); return;
5624
5625 default:
5626 HALT_NYI;
5627 }
5628 }
5629
5630 switch (INSTR (13, 10))
5631 {
5632 case 0x6: do_vec_UZP (cpu); return;
5633 case 0xE: do_vec_ZIP (cpu); return;
5634 case 0xA: do_vec_TRN (cpu); return;
5635 case 0xF: do_vec_UMOV (cpu); return;
5636 default: HALT_NYI;
5637 }
5638 }
5639
5640 switch (INSTR (15, 10))
5641 {
5642 case 0x02: do_vec_REV64 (cpu); return;
5643 case 0x06: do_vec_REV16 (cpu); return;
5644
5645 case 0x07:
5646 switch (INSTR (23, 21))
5647 {
5648 case 1: do_vec_AND (cpu); return;
5649 case 3: do_vec_BIC (cpu); return;
5650 case 5: do_vec_ORR (cpu); return;
5651 case 7: do_vec_ORN (cpu); return;
5652 default: HALT_NYI;
5653 }
5654
5655 case 0x08: do_vec_sub_long (cpu); return;
5656 case 0x0a: do_vec_XTN (cpu); return;
5657 case 0x11: do_vec_SSHL (cpu); return;
5658 case 0x16: do_vec_CNT (cpu); return;
5659 case 0x19: do_vec_max (cpu); return;
5660 case 0x1B: do_vec_min (cpu); return;
5661 case 0x21: do_vec_add (cpu); return;
5662 case 0x25: do_vec_MLA (cpu); return;
5663 case 0x27: do_vec_mul (cpu); return;
5664 case 0x2F: do_vec_ADDP (cpu); return;
5665 case 0x30: do_vec_mull (cpu); return;
5666 case 0x33: do_vec_FMLA (cpu); return;
5667 case 0x35: do_vec_fadd (cpu); return;
5668
5669 case 0x2E:
5670 switch (INSTR (20, 16))
5671 {
5672 case 0x00: do_vec_ABS (cpu); return;
5673 case 0x01: do_vec_FCVTZS (cpu); return;
5674 case 0x11: do_vec_ADDV (cpu); return;
5675 default: HALT_NYI;
5676 }
5677
5678 case 0x31:
5679 case 0x3B:
5680 do_vec_Fminmax (cpu); return;
5681
5682 case 0x0D:
5683 case 0x0F:
5684 case 0x22:
5685 case 0x23:
5686 case 0x26:
5687 case 0x2A:
5688 case 0x32:
5689 case 0x36:
5690 case 0x39:
5691 case 0x3A:
5692 do_vec_compare (cpu); return;
5693
5694 case 0x3E:
5695 do_vec_FABS (cpu); return;
5696
5697 default:
5698 HALT_NYI;
5699 }
5700 }
5701
5702 static void
5703 do_vec_xtl (sim_cpu *cpu)
5704 {
5705 /* instr[31] = 0
5706 instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5707 instr[28,22] = 0 1111 00
5708 instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5709 instr[15,10] = 1010 01
5710 instr[9,5] = V source
5711 instr[4,0] = V dest. */
5712
5713 unsigned vs = INSTR (9, 5);
5714 unsigned vd = INSTR (4, 0);
5715 unsigned i, shift, bias = 0;
5716
5717 NYI_assert (28, 22, 0x3C);
5718 NYI_assert (15, 10, 0x29);
5719
5720 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
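  /* The "2" variants (SXTL2, UXTL2, etc) take their input from the
     upper half of the source register, which is what the bias value
     selects.  */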
5721 switch (INSTR (30, 29))
5722 {
5723 case 2: /* SXTL2, SSHLL2. */
5724       bias = 2; /* Fall through.  */
5725 case 0: /* SXTL, SSHLL. */
5726 if (INSTR (21, 21))
5727 {
5728 int64_t val1, val2;
5729
5730 shift = INSTR (20, 16);
5731 /* Get the source values before setting the destination values
5732 in case the source and destination are the same. */
5733 	  val1 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5734 	  val2 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5735 aarch64_set_vec_s64 (cpu, vd, 0, val1);
5736 aarch64_set_vec_s64 (cpu, vd, 1, val2);
5737 }
5738 else if (INSTR (20, 20))
5739 {
5740 int32_t v[4];
5741 int32_t v1,v2,v3,v4;
5742
5743 shift = INSTR (19, 16);
5744 bias *= 2;
5745 for (i = 0; i < 4; i++)
5746 v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5747 for (i = 0; i < 4; i++)
5748 aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5749 }
5750 else
5751 {
5752 int16_t v[8];
5753 NYI_assert (19, 19, 1);
5754
5755 shift = INSTR (18, 16);
5756 bias *= 4;
5757 for (i = 0; i < 8; i++)
5758 v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5759 for (i = 0; i < 8; i++)
5760 aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5761 }
5762 return;
5763
5764 case 3: /* UXTL2, USHLL2. */
5765       bias = 2; /* Fall through.  */
5766 case 1: /* UXTL, USHLL. */
5767 if (INSTR (21, 21))
5768 {
5769 uint64_t v1, v2;
5770 shift = INSTR (20, 16);
5771 	  v1 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5772 	  v2 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5773 aarch64_set_vec_u64 (cpu, vd, 0, v1);
5774 aarch64_set_vec_u64 (cpu, vd, 1, v2);
5775 }
5776 else if (INSTR (20, 20))
5777 {
5778 uint32_t v[4];
5779 shift = INSTR (19, 16);
5780 bias *= 2;
5781 for (i = 0; i < 4; i++)
5782 v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5783 for (i = 0; i < 4; i++)
5784 aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5785 }
5786 else
5787 {
5788 uint16_t v[8];
5789 NYI_assert (19, 19, 1);
5790
5791 shift = INSTR (18, 16);
5792 bias *= 4;
5793 for (i = 0; i < 8; i++)
5794 v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5795 for (i = 0; i < 8; i++)
5796 aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5797 }
5798 return;
5799 }
5800 }
5801
5802 static void
5803 do_vec_SHL (sim_cpu *cpu)
5804 {
5805 /* instr [31] = 0
5806 instr [30] = half(0)/full(1)
5807 instr [29,23] = 001 1110
5808 instr [22,16] = size and shift amount
5809 instr [15,10] = 01 0101
5810 instr [9, 5] = Vs
5811 instr [4, 0] = Vd. */
5812
5813 int shift;
5814 int full = INSTR (30, 30);
5815 unsigned vs = INSTR (9, 5);
5816 unsigned vd = INSTR (4, 0);
5817 unsigned i;
5818
5819 NYI_assert (29, 23, 0x1E);
5820 NYI_assert (15, 10, 0x15);
5821
5822 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
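  /* The highest set bit in instr[22,19] selects the element size and
     the immediate bits below it give the left shift count.  */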
5823 if (INSTR (22, 22))
5824 {
5825 shift = INSTR (21, 16);
5826
5827 if (full == 0)
5828 HALT_UNALLOC;
5829
5830 for (i = 0; i < 2; i++)
5831 {
5832 uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5833 aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5834 }
5835
5836 return;
5837 }
5838
5839 if (INSTR (21, 21))
5840 {
5841 shift = INSTR (20, 16);
5842
5843 for (i = 0; i < (full ? 4 : 2); i++)
5844 {
5845 uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5846 aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5847 }
5848
5849 return;
5850 }
5851
5852 if (INSTR (20, 20))
5853 {
5854 shift = INSTR (19, 16);
5855
5856 for (i = 0; i < (full ? 8 : 4); i++)
5857 {
5858 uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5859 aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5860 }
5861
5862 return;
5863 }
5864
5865 if (INSTR (19, 19) == 0)
5866 HALT_UNALLOC;
5867
5868 shift = INSTR (18, 16);
5869
5870 for (i = 0; i < (full ? 16 : 8); i++)
5871 {
5872 uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5873 aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5874 }
5875 }
5876
5877 static void
5878 do_vec_SSHR_USHR (sim_cpu *cpu)
5879 {
5880 /* instr [31] = 0
5881 instr [30] = half(0)/full(1)
5882 instr [29] = signed(0)/unsigned(1)
5883 instr [28,23] = 0 1111 0
5884 instr [22,16] = size and shift amount
5885 instr [15,10] = 0000 01
5886 instr [9, 5] = Vs
5887 instr [4, 0] = Vd. */
5888
5889 int full = INSTR (30, 30);
5890 int sign = ! INSTR (29, 29);
5891 unsigned shift = INSTR (22, 16);
5892 unsigned vs = INSTR (9, 5);
5893 unsigned vd = INSTR (4, 0);
5894 unsigned i;
5895
5896 NYI_assert (28, 23, 0x1E);
5897 NYI_assert (15, 10, 0x01);
5898
5899 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
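  /* The same immediate layout as SHL, but the field holds twice the
     element size minus the shift, so the right shift count is
     recovered by subtracting from 128, 64, 32 or 16.  */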
5900 if (INSTR (22, 22))
5901 {
5902 shift = 128 - shift;
5903
5904 if (full == 0)
5905 HALT_UNALLOC;
5906
5907 if (sign)
5908 for (i = 0; i < 2; i++)
5909 {
5910 int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
5911 aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
5912 }
5913 else
5914 for (i = 0; i < 2; i++)
5915 {
5916 uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5917 aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
5918 }
5919
5920 return;
5921 }
5922
5923 if (INSTR (21, 21))
5924 {
5925 shift = 64 - shift;
5926
5927 if (sign)
5928 for (i = 0; i < (full ? 4 : 2); i++)
5929 {
5930 int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
5931 aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
5932 }
5933 else
5934 for (i = 0; i < (full ? 4 : 2); i++)
5935 {
5936 uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5937 aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
5938 }
5939
5940 return;
5941 }
5942
5943 if (INSTR (20, 20))
5944 {
5945 shift = 32 - shift;
5946
5947 if (sign)
5948 for (i = 0; i < (full ? 8 : 4); i++)
5949 {
5950 int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
5951 aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
5952 }
5953 else
5954 for (i = 0; i < (full ? 8 : 4); i++)
5955 {
5956 uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5957 aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
5958 }
5959
5960 return;
5961 }
5962
5963 if (INSTR (19, 19) == 0)
5964 HALT_UNALLOC;
5965
5966 shift = 16 - shift;
5967
5968 if (sign)
5969 for (i = 0; i < (full ? 16 : 8); i++)
5970 {
5971 int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
5972 aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
5973 }
5974 else
5975 for (i = 0; i < (full ? 16 : 8); i++)
5976 {
5977 uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5978 aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
5979 }
5980 }
5981
5982 static void
5983 do_vec_MUL_by_element (sim_cpu *cpu)
5984 {
5985 /* instr[31] = 0
5986 instr[30] = half/full
5987 instr[29,24] = 00 1111
5988 instr[23,22] = size
5989 instr[21] = L
5990 instr[20] = M
5991 instr[19,16] = m
5992 instr[15,12] = 1000
5993 instr[11] = H
5994 instr[10] = 0
5995 instr[9,5] = Vn
5996 instr[4,0] = Vd */
5997
5998 unsigned full = INSTR (30, 30);
5999 unsigned L = INSTR (21, 21);
6000 unsigned H = INSTR (11, 11);
6001 unsigned vn = INSTR (9, 5);
6002 unsigned vd = INSTR (4, 0);
6003 unsigned size = INSTR (23, 22);
6004 unsigned index;
6005 unsigned vm;
6006 unsigned e;
6007
6008 NYI_assert (29, 24, 0x0F);
6009 NYI_assert (15, 12, 0x8);
6010 NYI_assert (10, 10, 0);
6011
6012 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
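  /* The index of the Vm element is assembled from the H, L and M
     bits: 16-bit elements use all three and restrict Vm to V0-V15,
     while 32-bit elements use only H and L.  */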
6013 switch (size)
6014 {
6015 case 1:
6016 {
6017 /* 16 bit products. */
6018 uint16_t product;
6019 uint16_t element1;
6020 uint16_t element2;
6021
6022 index = (H << 2) | (L << 1) | INSTR (20, 20);
6023 vm = INSTR (19, 16);
6024 element2 = aarch64_get_vec_u16 (cpu, vm, index);
6025
6026 for (e = 0; e < (full ? 8 : 4); e ++)
6027 {
6028 element1 = aarch64_get_vec_u16 (cpu, vn, e);
6029 product = element1 * element2;
6030 aarch64_set_vec_u16 (cpu, vd, e, product);
6031 }
6032 }
6033 break;
6034
6035 case 2:
6036 {
6037 /* 32 bit products. */
6038 uint32_t product;
6039 uint32_t element1;
6040 uint32_t element2;
6041
6042 index = (H << 1) | L;
6043 vm = INSTR (20, 16);
6044 element2 = aarch64_get_vec_u32 (cpu, vm, index);
6045
6046 for (e = 0; e < (full ? 4 : 2); e ++)
6047 {
6048 element1 = aarch64_get_vec_u32 (cpu, vn, e);
6049 product = element1 * element2;
6050 aarch64_set_vec_u32 (cpu, vd, e, product);
6051 }
6052 }
6053 break;
6054
6055 default:
6056 HALT_UNALLOC;
6057 }
6058 }
6059
6060 static void
6061 do_FMLA_by_element (sim_cpu *cpu)
6062 {
6063 /* instr[31] = 0
6064 instr[30] = half/full
6065 instr[29,23] = 00 1111 1
6066 instr[22] = size
6067 instr[21] = L
6068 instr[20,16] = m
6069 instr[15,12] = 0001
6070 instr[11] = H
6071 instr[10] = 0
6072 instr[9,5] = Vn
6073 instr[4,0] = Vd */
6074
6075 unsigned full = INSTR (30, 30);
6076 unsigned size = INSTR (22, 22);
6077 unsigned L = INSTR (21, 21);
6078 unsigned vm = INSTR (20, 16);
6079 unsigned H = INSTR (11, 11);
6080 unsigned vn = INSTR (9, 5);
6081 unsigned vd = INSTR (4, 0);
6082 unsigned e;
6083
6084 NYI_assert (29, 23, 0x1F);
6085 NYI_assert (15, 12, 0x1);
6086 NYI_assert (10, 10, 0);
6087
6088 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6089 if (size)
6090 {
6091 double element1, element2;
6092
6093 if (! full || L)
6094 HALT_UNALLOC;
6095
6096 element2 = aarch64_get_vec_double (cpu, vm, H);
6097
6098 for (e = 0; e < 2; e++)
6099 {
6100 element1 = aarch64_get_vec_double (cpu, vn, e);
6101 element1 *= element2;
6102 element1 += aarch64_get_vec_double (cpu, vd, e);
6103 aarch64_set_vec_double (cpu, vd, e, element1);
6104 }
6105 }
6106 else
6107 {
6108 float element1;
6109 float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6110
6111 for (e = 0; e < (full ? 4 : 2); e++)
6112 {
6113 element1 = aarch64_get_vec_float (cpu, vn, e);
6114 element1 *= element2;
6115 element1 += aarch64_get_vec_float (cpu, vd, e);
6116 aarch64_set_vec_float (cpu, vd, e, element1);
6117 }
6118 }
6119 }
6120
6121 static void
6122 do_vec_op2 (sim_cpu *cpu)
6123 {
6124 /* instr[31] = 0
6125 instr[30] = half/full
6126 instr[29,24] = 00 1111
6127 instr[23] = ?
6128 instr[22,16] = element size & index
6129 instr[15,10] = sub-opcode
6130     instr[9,5] = Vn
6131 instr[4,0] = Vd */
6132
6133 NYI_assert (29, 24, 0x0F);
6134
6135 if (INSTR (23, 23) != 0)
6136 {
6137 switch (INSTR (15, 10))
6138 {
6139 case 0x04:
6140 case 0x06:
6141 do_FMLA_by_element (cpu);
6142 return;
6143
6144 case 0x20:
6145 case 0x22:
6146 do_vec_MUL_by_element (cpu);
6147 return;
6148
6149 default:
6150 HALT_NYI;
6151 }
6152 }
6153 else
6154 {
6155 switch (INSTR (15, 10))
6156 {
6157 case 0x01: do_vec_SSHR_USHR (cpu); return;
6158 case 0x15: do_vec_SHL (cpu); return;
6159 case 0x20:
6160 case 0x22: do_vec_MUL_by_element (cpu); return;
6161 case 0x29: do_vec_xtl (cpu); return;
6162 default: HALT_NYI;
6163 }
6164 }
6165 }
6166
6167 static void
6168 do_vec_neg (sim_cpu *cpu)
6169 {
6170 /* instr[31] = 0
6171 instr[30] = full(1)/half(0)
6172 instr[29,24] = 10 1110
6173 instr[23,22] = size: byte(00), half (01), word (10), long (11)
6174 instr[21,10] = 1000 0010 1110
6175 instr[9,5] = Vs
6176 instr[4,0] = Vd */
6177
6178 int full = INSTR (30, 30);
6179 unsigned vs = INSTR (9, 5);
6180 unsigned vd = INSTR (4, 0);
6181 unsigned i;
6182
6183 NYI_assert (29, 24, 0x2E);
6184 NYI_assert (21, 10, 0x82E);
6185
6186 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6187 switch (INSTR (23, 22))
6188 {
6189 case 0:
6190 for (i = 0; i < (full ? 16 : 8); i++)
6191 aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6192 return;
6193
6194 case 1:
6195 for (i = 0; i < (full ? 8 : 4); i++)
6196 aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6197 return;
6198
6199 case 2:
6200 for (i = 0; i < (full ? 4 : 2); i++)
6201 aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6202 return;
6203
6204 case 3:
6205 if (! full)
6206 HALT_NYI;
6207 for (i = 0; i < 2; i++)
6208 aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6209 return;
6210 }
6211 }
6212
6213 static void
6214 do_vec_sqrt (sim_cpu *cpu)
6215 {
6216 /* instr[31] = 0
6217 instr[30] = full(1)/half(0)
6218 instr[29,23] = 101 1101
6219 instr[22] = single(0)/double(1)
6220 instr[21,10] = 1000 0111 1110
6221 instr[9,5] = Vs
6222 instr[4,0] = Vd. */
6223
6224 int full = INSTR (30, 30);
6225 unsigned vs = INSTR (9, 5);
6226 unsigned vd = INSTR (4, 0);
6227 unsigned i;
6228
  NYI_assert (29, 23, 0x5D);
6230 NYI_assert (21, 10, 0x87E);
6231
6232 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6233 if (INSTR (22, 22) == 0)
6234 for (i = 0; i < (full ? 4 : 2); i++)
6235 aarch64_set_vec_float (cpu, vd, i,
6236 sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6237 else
6238 for (i = 0; i < 2; i++)
6239 aarch64_set_vec_double (cpu, vd, i,
6240 sqrt (aarch64_get_vec_double (cpu, vs, i)));
6241 }
6242
6243 static void
6244 do_vec_mls_indexed (sim_cpu *cpu)
6245 {
6246 /* instr[31] = 0
6247 instr[30] = half(0)/full(1)
6248 instr[29,24] = 10 1111
     instr[23,22] = 16-bit(01)/32-bit(10)
     instr[11],[21],[20] = element index H:L:M (if 16-bit)
     instr[11],[21] = element index H:L (if 32-bit)
     instr[19,16] = Vm (if 16-bit)
     instr[20,16] = Vm (if 32-bit)
     instr[15,12] = 0100
     instr[10] = 0
     instr[9,5] = Vs
     instr[4,0] = Vd.  */
6258
6259 int full = INSTR (30, 30);
6260 unsigned vs = INSTR (9, 5);
6261 unsigned vd = INSTR (4, 0);
6262 unsigned vm = INSTR (20, 16);
6263 unsigned i;
6264
6265 NYI_assert (15, 12, 4);
6266 NYI_assert (10, 10, 0);
6267
6268 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6269 switch (INSTR (23, 22))
6270 {
6271 case 1:
6272 {
	unsigned elem;
	uint32_t val;

	/* For 16-bit elements the register is instr[19,16] and
	   instr[20] is the M bit of the element index H:L:M.  */
	elem = (INSTR (11, 11) << 2) | (INSTR (21, 21) << 1) | INSTR (20, 20);
	val = aarch64_get_vec_u16 (cpu, vm & 0xF, elem);

	for (i = 0; i < (full ? 8 : 4); i++)
	  aarch64_set_vec_u16 (cpu, vd, i,
			       aarch64_get_vec_u16 (cpu, vd, i) -
			       (aarch64_get_vec_u16 (cpu, vs, i) * val));
6286 return;
6287 }
6288
6289 case 2:
6290 {
	unsigned elem = (INSTR (11, 11) << 1) | INSTR (21, 21);
	uint32_t val = aarch64_get_vec_u32 (cpu, vm, elem);

	for (i = 0; i < (full ? 4 : 2); i++)
	  aarch64_set_vec_u32 (cpu, vd, i,
			       aarch64_get_vec_u32 (cpu, vd, i) -
			       (aarch64_get_vec_u32 (cpu, vs, i) * val));
6298 return;
6299 }
6300
6301 case 0:
6302 case 3:
6303 default:
6304 HALT_NYI;
6305 }
6306 }
6307
6308 static void
6309 do_vec_SUB (sim_cpu *cpu)
6310 {
6311 /* instr [31] = 0
6312 instr [30] = half(0)/full(1)
6313 instr [29,24] = 10 1110
     instr [23,22] = size: byte(00), half(01), word(10), long(11)
6315 instr [21] = 1
6316 instr [20,16] = Vm
6317 instr [15,10] = 10 0001
6318 instr [9, 5] = Vn
6319 instr [4, 0] = Vd. */
6320
6321 unsigned full = INSTR (30, 30);
6322 unsigned vm = INSTR (20, 16);
6323 unsigned vn = INSTR (9, 5);
6324 unsigned vd = INSTR (4, 0);
6325 unsigned i;
6326
6327 NYI_assert (29, 24, 0x2E);
6328 NYI_assert (21, 21, 1);
6329 NYI_assert (15, 10, 0x21);
6330
6331 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6332 switch (INSTR (23, 22))
6333 {
6334 case 0:
6335 for (i = 0; i < (full ? 16 : 8); i++)
6336 aarch64_set_vec_s8 (cpu, vd, i,
6337 aarch64_get_vec_s8 (cpu, vn, i)
6338 - aarch64_get_vec_s8 (cpu, vm, i));
6339 return;
6340
6341 case 1:
6342 for (i = 0; i < (full ? 8 : 4); i++)
6343 aarch64_set_vec_s16 (cpu, vd, i,
6344 aarch64_get_vec_s16 (cpu, vn, i)
6345 - aarch64_get_vec_s16 (cpu, vm, i));
6346 return;
6347
6348 case 2:
6349 for (i = 0; i < (full ? 4 : 2); i++)
6350 aarch64_set_vec_s32 (cpu, vd, i,
6351 aarch64_get_vec_s32 (cpu, vn, i)
6352 - aarch64_get_vec_s32 (cpu, vm, i));
6353 return;
6354
6355 case 3:
6356 if (full == 0)
6357 HALT_UNALLOC;
6358
6359 for (i = 0; i < 2; i++)
6360 aarch64_set_vec_s64 (cpu, vd, i,
6361 aarch64_get_vec_s64 (cpu, vn, i)
6362 - aarch64_get_vec_s64 (cpu, vm, i));
6363 return;
6364 }
6365 }
6366
6367 static void
6368 do_vec_MLS (sim_cpu *cpu)
6369 {
6370 /* instr [31] = 0
6371 instr [30] = half(0)/full(1)
6372 instr [29,24] = 10 1110
     instr [23,22] = size: byte(00), half(01), word(10)
6374 instr [21] = 1
6375 instr [20,16] = Vm
6376 instr [15,10] = 10 0101
6377 instr [9, 5] = Vn
6378 instr [4, 0] = Vd. */
6379
6380 unsigned full = INSTR (30, 30);
6381 unsigned vm = INSTR (20, 16);
6382 unsigned vn = INSTR (9, 5);
6383 unsigned vd = INSTR (4, 0);
6384 unsigned i;
6385
6386 NYI_assert (29, 24, 0x2E);
6387 NYI_assert (21, 21, 1);
6388 NYI_assert (15, 10, 0x25);
6389
6390 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6391 switch (INSTR (23, 22))
6392 {
6393 case 0:
6394 for (i = 0; i < (full ? 16 : 8); i++)
6395 aarch64_set_vec_u8 (cpu, vd, i,
6396 aarch64_get_vec_u8 (cpu, vd, i)
6397 - (aarch64_get_vec_u8 (cpu, vn, i)
6398 * aarch64_get_vec_u8 (cpu, vm, i)));
6399 return;
6400
6401 case 1:
6402 for (i = 0; i < (full ? 8 : 4); i++)
6403 aarch64_set_vec_u16 (cpu, vd, i,
6404 aarch64_get_vec_u16 (cpu, vd, i)
6405 - (aarch64_get_vec_u16 (cpu, vn, i)
6406 * aarch64_get_vec_u16 (cpu, vm, i)));
6407 return;
6408
6409 case 2:
6410 for (i = 0; i < (full ? 4 : 2); i++)
6411 aarch64_set_vec_u32 (cpu, vd, i,
6412 aarch64_get_vec_u32 (cpu, vd, i)
6413 - (aarch64_get_vec_u32 (cpu, vn, i)
6414 * aarch64_get_vec_u32 (cpu, vm, i)));
6415 return;
6416
6417 default:
6418 HALT_UNALLOC;
6419 }
6420 }
6421
6422 static void
6423 do_vec_FDIV (sim_cpu *cpu)
6424 {
6425 /* instr [31] = 0
6426 instr [30] = half(0)/full(1)
6427 instr [29,23] = 10 1110 0
     instr [22]    = float(0)/double(1)
6429 instr [21] = 1
6430 instr [20,16] = Vm
6431 instr [15,10] = 1111 11
6432 instr [9, 5] = Vn
6433 instr [4, 0] = Vd. */
6434
6435 unsigned full = INSTR (30, 30);
6436 unsigned vm = INSTR (20, 16);
6437 unsigned vn = INSTR (9, 5);
6438 unsigned vd = INSTR (4, 0);
6439 unsigned i;
6440
6441 NYI_assert (29, 23, 0x5C);
6442 NYI_assert (21, 21, 1);
6443 NYI_assert (15, 10, 0x3F);
6444
6445 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6446 if (INSTR (22, 22))
6447 {
6448 if (! full)
6449 HALT_UNALLOC;
6450
6451 for (i = 0; i < 2; i++)
6452 aarch64_set_vec_double (cpu, vd, i,
6453 aarch64_get_vec_double (cpu, vn, i)
6454 / aarch64_get_vec_double (cpu, vm, i));
6455 }
6456 else
6457 for (i = 0; i < (full ? 4 : 2); i++)
6458 aarch64_set_vec_float (cpu, vd, i,
6459 aarch64_get_vec_float (cpu, vn, i)
6460 / aarch64_get_vec_float (cpu, vm, i));
6461 }
6462
6463 static void
6464 do_vec_FMUL (sim_cpu *cpu)
6465 {
6466 /* instr [31] = 0
6467 instr [30] = half(0)/full(1)
6468 instr [29,23] = 10 1110 0
6469 instr [22] = float(0)/double(1)
6470 instr [21] = 1
6471 instr [20,16] = Vm
6472 instr [15,10] = 1101 11
6473 instr [9, 5] = Vn
6474 instr [4, 0] = Vd. */
6475
6476 unsigned full = INSTR (30, 30);
6477 unsigned vm = INSTR (20, 16);
6478 unsigned vn = INSTR (9, 5);
6479 unsigned vd = INSTR (4, 0);
6480 unsigned i;
6481
6482 NYI_assert (29, 23, 0x5C);
6483 NYI_assert (21, 21, 1);
6484 NYI_assert (15, 10, 0x37);
6485
6486 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6487 if (INSTR (22, 22))
6488 {
6489 if (! full)
6490 HALT_UNALLOC;
6491
6492 for (i = 0; i < 2; i++)
6493 aarch64_set_vec_double (cpu, vd, i,
6494 aarch64_get_vec_double (cpu, vn, i)
6495 * aarch64_get_vec_double (cpu, vm, i));
6496 }
6497 else
6498 for (i = 0; i < (full ? 4 : 2); i++)
6499 aarch64_set_vec_float (cpu, vd, i,
6500 aarch64_get_vec_float (cpu, vn, i)
6501 * aarch64_get_vec_float (cpu, vm, i));
6502 }
6503
6504 static void
6505 do_vec_FADDP (sim_cpu *cpu)
6506 {
6507 /* instr [31] = 0
6508 instr [30] = half(0)/full(1)
6509 instr [29,23] = 10 1110 0
6510 instr [22] = float(0)/double(1)
6511 instr [21] = 1
6512 instr [20,16] = Vm
6513 instr [15,10] = 1101 01
6514 instr [9, 5] = Vn
6515 instr [4, 0] = Vd. */
6516
6517 unsigned full = INSTR (30, 30);
6518 unsigned vm = INSTR (20, 16);
6519 unsigned vn = INSTR (9, 5);
6520 unsigned vd = INSTR (4, 0);
6521
6522 NYI_assert (29, 23, 0x5C);
6523 NYI_assert (21, 21, 1);
6524 NYI_assert (15, 10, 0x35);
6525
6526 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6527 if (INSTR (22, 22))
6528 {
      /* Extract values before adding them in case vd == vn/vm.  */
6530 double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6531 double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6532 double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6533 double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6534
6535 if (! full)
6536 HALT_UNALLOC;
6537
6538 aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6539 aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6540 }
6541 else
6542 {
      /* Extract values before adding them in case vd == vn/vm.  */
6544 float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6545 float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6546 float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6547 float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6548
6549 if (full)
6550 {
6551 float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6552 float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6553 float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6554 float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6555
6556 aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6557 aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6558 aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6559 aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6560 }
6561 else
6562 {
6563 aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6564 aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6565 }
6566 }
6567 }
6568
6569 static void
6570 do_vec_FSQRT (sim_cpu *cpu)
6571 {
6572 /* instr[31] = 0
6573 instr[30] = half(0)/full(1)
6574 instr[29,23] = 10 1110 1
6575 instr[22] = single(0)/double(1)
6576 instr[21,10] = 10 0001 1111 10
6577 instr[9,5] = Vsrc
6578 instr[4,0] = Vdest. */
6579
6580 unsigned vn = INSTR (9, 5);
6581 unsigned vd = INSTR (4, 0);
6582 unsigned full = INSTR (30, 30);
6583 int i;
6584
6585 NYI_assert (29, 23, 0x5D);
6586 NYI_assert (21, 10, 0x87E);
6587
6588 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6589 if (INSTR (22, 22))
6590 {
6591 if (! full)
6592 HALT_UNALLOC;
6593
6594 for (i = 0; i < 2; i++)
6595 aarch64_set_vec_double (cpu, vd, i,
6596 sqrt (aarch64_get_vec_double (cpu, vn, i)));
6597 }
6598 else
6599 {
6600 for (i = 0; i < (full ? 4 : 2); i++)
6601 aarch64_set_vec_float (cpu, vd, i,
6602 sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6603 }
6604 }
6605
6606 static void
6607 do_vec_FNEG (sim_cpu *cpu)
6608 {
6609 /* instr[31] = 0
6610 instr[30] = half (0)/full (1)
6611 instr[29,23] = 10 1110 1
6612 instr[22] = single (0)/double (1)
6613 instr[21,10] = 10 0000 1111 10
6614 instr[9,5] = Vsrc
6615 instr[4,0] = Vdest. */
6616
6617 unsigned vn = INSTR (9, 5);
6618 unsigned vd = INSTR (4, 0);
6619 unsigned full = INSTR (30, 30);
6620 int i;
6621
6622 NYI_assert (29, 23, 0x5D);
6623 NYI_assert (21, 10, 0x83E);
6624
6625 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6626 if (INSTR (22, 22))
6627 {
6628 if (! full)
6629 HALT_UNALLOC;
6630
6631 for (i = 0; i < 2; i++)
6632 aarch64_set_vec_double (cpu, vd, i,
6633 - aarch64_get_vec_double (cpu, vn, i));
6634 }
6635 else
6636 {
6637 for (i = 0; i < (full ? 4 : 2); i++)
6638 aarch64_set_vec_float (cpu, vd, i,
6639 - aarch64_get_vec_float (cpu, vn, i));
6640 }
6641 }
6642
6643 static void
6644 do_vec_NOT (sim_cpu *cpu)
6645 {
6646 /* instr[31] = 0
6647 instr[30] = half (0)/full (1)
6648 instr[29,10] = 10 1110 0010 0000 0101 10
6649 instr[9,5] = Vn
     instr[4,0] = Vd.  */
6651
6652 unsigned vn = INSTR (9, 5);
6653 unsigned vd = INSTR (4, 0);
6654 unsigned i;
6655 int full = INSTR (30, 30);
6656
6657 NYI_assert (29, 10, 0xB8816);
6658
6659 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6660 for (i = 0; i < (full ? 16 : 8); i++)
6661 aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6662 }
6663
6664 static unsigned int
6665 clz (uint64_t val, unsigned size)
6666 {
6667 uint64_t mask = 1;
6668 int count;
6669
6670 mask <<= (size - 1);
6671 count = 0;
6672 do
6673 {
6674 if (val & mask)
6675 break;
6676 mask >>= 1;
6677 count ++;
6678 }
6679 while (mask);
6680
6681 return count;
6682 }
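
/* Illustrative cross-check for the helper above, not used by the
   simulator: assuming a GCC-style __builtin_clzll, the loop is
   equivalent to the branch-free form below for values that fit in
   SIZE bits.  Both return SIZE when VAL is zero, e.g.
   clz (0x80, 8) == 0 and clz (1, 64) == 63.  */
static unsigned int
clz_builtin_sketch (uint64_t val, unsigned size)
{
  /* __builtin_clzll counts leading zeros of a 64-bit word and is
     undefined for zero, hence the guard; rebase the count to the
     SIZE-bit element width.  */
  return val ? __builtin_clzll (val) - (64 - size) : size;
}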
6683
6684 static void
6685 do_vec_CLZ (sim_cpu *cpu)
6686 {
6687 /* instr[31] = 0
6688 instr[30] = half (0)/full (1)
6689 instr[29,24] = 10 1110
6690 instr[23,22] = size
6691 instr[21,10] = 10 0000 0100 10
6692 instr[9,5] = Vn
     instr[4,0] = Vd.  */
6694
6695 unsigned vn = INSTR (9, 5);
6696 unsigned vd = INSTR (4, 0);
6697 unsigned i;
6698 int full = INSTR (30,30);
6699
6700 NYI_assert (29, 24, 0x2E);
6701 NYI_assert (21, 10, 0x812);
6702
6703 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6704 switch (INSTR (23, 22))
6705 {
6706 case 0:
6707 for (i = 0; i < (full ? 16 : 8); i++)
6708 aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6709 break;
6710 case 1:
6711 for (i = 0; i < (full ? 8 : 4); i++)
6712 aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6713 break;
6714 case 2:
6715 for (i = 0; i < (full ? 4 : 2); i++)
6716 aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6717 break;
6718 case 3:
6719 if (! full)
6720 HALT_UNALLOC;
6721 aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6722 aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6723 break;
6724 }
6725 }
6726
6727 static void
6728 do_vec_MOV_element (sim_cpu *cpu)
6729 {
6730 /* instr[31,21] = 0110 1110 000
6731 instr[20,16] = size & dest index
6732 instr[15] = 0
6733 instr[14,11] = source index
6734 instr[10] = 1
6735 instr[9,5] = Vs
     instr[4,0] = Vd.  */
6737
6738 unsigned vs = INSTR (9, 5);
6739 unsigned vd = INSTR (4, 0);
6740 unsigned src_index;
6741 unsigned dst_index;
6742
6743 NYI_assert (31, 21, 0x370);
6744 NYI_assert (15, 15, 0);
6745 NYI_assert (10, 10, 1);
6746
6747 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6748 if (INSTR (16, 16))
6749 {
6750 /* Move a byte. */
6751 src_index = INSTR (14, 11);
6752 dst_index = INSTR (20, 17);
6753 aarch64_set_vec_u8 (cpu, vd, dst_index,
6754 aarch64_get_vec_u8 (cpu, vs, src_index));
6755 }
6756 else if (INSTR (17, 17))
6757 {
6758 /* Move 16-bits. */
6759 NYI_assert (11, 11, 0);
6760 src_index = INSTR (14, 12);
6761 dst_index = INSTR (20, 18);
6762 aarch64_set_vec_u16 (cpu, vd, dst_index,
6763 aarch64_get_vec_u16 (cpu, vs, src_index));
6764 }
6765 else if (INSTR (18, 18))
6766 {
6767 /* Move 32-bits. */
6768 NYI_assert (12, 11, 0);
6769 src_index = INSTR (14, 13);
6770 dst_index = INSTR (20, 19);
6771 aarch64_set_vec_u32 (cpu, vd, dst_index,
6772 aarch64_get_vec_u32 (cpu, vs, src_index));
6773 }
6774 else
6775 {
6776 NYI_assert (19, 19, 1);
6777 NYI_assert (13, 11, 0);
6778 src_index = INSTR (14, 14);
6779 dst_index = INSTR (20, 20);
6780 aarch64_set_vec_u64 (cpu, vd, dst_index,
6781 aarch64_get_vec_u64 (cpu, vs, src_index));
6782 }
6783 }
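
/* The imm5 field decoded above selects both the element size and the
   destination index by the position of its lowest set bit.  A
   stand-alone restatement of the rule (hypothetical helper, not used
   by the decoder above):  */
static unsigned
mov_element_size_sketch (uint32_t imm5)
{
  if (imm5 & 0x1)
    return 8;	/* xxxx1: byte, dest index = imm5<4:1>.  */
  if (imm5 & 0x2)
    return 16;	/* xxx10: half, dest index = imm5<4:2>.  */
  if (imm5 & 0x4)
    return 32;	/* xx100: word, dest index = imm5<4:3>.  */
  return 64;	/* x1000: doubleword, dest index = imm5<4>.  */
}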
6784
6785 static void
6786 do_vec_REV32 (sim_cpu *cpu)
6787 {
6788 /* instr[31] = 0
6789 instr[30] = full/half
6790 instr[29,24] = 10 1110
6791 instr[23,22] = size
6792 instr[21,10] = 10 0000 0000 10
6793 instr[9,5] = Rn
6794 instr[4,0] = Rd. */
6795
6796 unsigned rn = INSTR (9, 5);
6797 unsigned rd = INSTR (4, 0);
6798 unsigned size = INSTR (23, 22);
6799 unsigned full = INSTR (30, 30);
6800 unsigned i;
6801 FRegister val;
6802
6803 NYI_assert (29, 24, 0x2E);
6804 NYI_assert (21, 10, 0x802);
6805
6806 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6807 switch (size)
6808 {
6809 case 0:
6810 for (i = 0; i < (full ? 16 : 8); i++)
6811 val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6812 break;
6813
6814 case 1:
6815 for (i = 0; i < (full ? 8 : 4); i++)
6816 val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6817 break;
6818
6819 default:
6820 HALT_UNALLOC;
6821 }
6822
6823 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6824 if (full)
6825 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6826 }
6827
6828 static void
6829 do_vec_EXT (sim_cpu *cpu)
6830 {
6831 /* instr[31] = 0
6832 instr[30] = full/half
6833 instr[29,21] = 10 1110 000
6834 instr[20,16] = Vm
6835 instr[15] = 0
6836 instr[14,11] = source index
6837 instr[10] = 0
6838 instr[9,5] = Vn
     instr[4,0] = Vd.  */
6840
6841 unsigned vm = INSTR (20, 16);
6842 unsigned vn = INSTR (9, 5);
6843 unsigned vd = INSTR (4, 0);
6844 unsigned src_index = INSTR (14, 11);
6845 unsigned full = INSTR (30, 30);
6846 unsigned i;
6847 unsigned j;
6848 FRegister val;
6849
6850 NYI_assert (31, 21, 0x370);
6851 NYI_assert (15, 15, 0);
6852 NYI_assert (10, 10, 0);
6853
6854 if (!full && (src_index & 0x8))
6855 HALT_UNALLOC;
6856
6857 j = 0;
6858
6859 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6860 for (i = src_index; i < (full ? 16 : 8); i++)
6861 val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6862 for (i = 0; i < src_index; i++)
6863 val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6864
6865 aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6866 if (full)
6867 aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6868 }
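
/* Worked example of the EXT extraction above as a self-contained
   sketch (hypothetical helper, not called elsewhere): EXT behaves
   like a byte-wise extract from the concatenation Vm:Vn, so with
   8-byte operands and index 3, Vd receives Vn bytes 3..7 followed by
   Vm bytes 0..2.  */
static void
ext_bytes_sketch (const uint8_t *vn, const uint8_t *vm, uint8_t *vd,
		  unsigned nbytes, unsigned index)
{
  unsigned i;

  for (i = 0; i < nbytes; i++)
    vd[i] = (index + i < nbytes) ? vn[index + i] : vm[index + i - nbytes];
}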
6869
6870 static void
6871 dexAdvSIMD0 (sim_cpu *cpu)
6872 {
6873 /* instr [28,25] = 0 111. */
  if (INSTR (15, 10) == 0x07
      && INSTR (9, 5) == INSTR (20, 16))
6877 {
6878 if (INSTR (31, 21) == 0x075
6879 || INSTR (31, 21) == 0x275)
6880 {
6881 do_vec_MOV_whole_vector (cpu);
6882 return;
6883 }
6884 }
6885
6886 if (INSTR (29, 19) == 0x1E0)
6887 {
6888 do_vec_MOV_immediate (cpu);
6889 return;
6890 }
6891
6892 if (INSTR (29, 19) == 0x5E0)
6893 {
6894 do_vec_MVNI (cpu);
6895 return;
6896 }
6897
6898 if (INSTR (29, 19) == 0x1C0
6899 || INSTR (29, 19) == 0x1C1)
6900 {
6901 if (INSTR (15, 10) == 0x03)
6902 {
6903 do_vec_DUP_scalar_into_vector (cpu);
6904 return;
6905 }
6906 }
6907
6908 switch (INSTR (29, 24))
6909 {
6910 case 0x0E: do_vec_op1 (cpu); return;
6911 case 0x0F: do_vec_op2 (cpu); return;
6912
6913 case 0x2E:
6914 if (INSTR (21, 21) == 1)
6915 {
6916 switch (INSTR (15, 10))
6917 {
6918 case 0x02:
6919 do_vec_REV32 (cpu);
6920 return;
6921
6922 case 0x07:
6923 switch (INSTR (23, 22))
6924 {
6925 case 0: do_vec_EOR (cpu); return;
6926 case 1: do_vec_BSL (cpu); return;
6927 case 2:
6928 case 3: do_vec_bit (cpu); return;
6929 }
6930 break;
6931
6932 case 0x08: do_vec_sub_long (cpu); return;
6933 case 0x11: do_vec_USHL (cpu); return;
6934 case 0x12: do_vec_CLZ (cpu); return;
6935 case 0x16: do_vec_NOT (cpu); return;
6936 case 0x19: do_vec_max (cpu); return;
6937 case 0x1B: do_vec_min (cpu); return;
6938 case 0x21: do_vec_SUB (cpu); return;
6939 case 0x25: do_vec_MLS (cpu); return;
6940 case 0x31: do_vec_FminmaxNMP (cpu); return;
6941 case 0x35: do_vec_FADDP (cpu); return;
6942 case 0x37: do_vec_FMUL (cpu); return;
6943 case 0x3F: do_vec_FDIV (cpu); return;
6944
6945 case 0x3E:
6946 switch (INSTR (20, 16))
6947 {
6948 case 0x00: do_vec_FNEG (cpu); return;
6949 case 0x01: do_vec_FSQRT (cpu); return;
6950 default: HALT_NYI;
6951 }
6952
6953 case 0x0D:
6954 case 0x0F:
6955 case 0x22:
6956 case 0x23:
6957 case 0x26:
6958 case 0x2A:
6959 case 0x32:
6960 case 0x36:
6961 case 0x39:
6962 case 0x3A:
6963 do_vec_compare (cpu); return;
6964
6965 default:
6966 break;
6967 }
6968 }
6969
6970 if (INSTR (31, 21) == 0x370)
6971 {
6972 if (INSTR (10, 10))
6973 do_vec_MOV_element (cpu);
6974 else
6975 do_vec_EXT (cpu);
6976 return;
6977 }
6978
6979 switch (INSTR (21, 10))
6980 {
6981 case 0x82E: do_vec_neg (cpu); return;
6982 case 0x87E: do_vec_sqrt (cpu); return;
6983 default:
6984 if (INSTR (15, 10) == 0x30)
6985 {
6986 do_vec_mull (cpu);
6987 return;
6988 }
6989 break;
6990 }
6991 break;
6992
6993 case 0x2f:
6994 switch (INSTR (15, 10))
6995 {
6996 case 0x01: do_vec_SSHR_USHR (cpu); return;
6997 case 0x10:
6998 case 0x12: do_vec_mls_indexed (cpu); return;
6999 case 0x29: do_vec_xtl (cpu); return;
7000 default:
7001 HALT_NYI;
7002 }
7003
7004 default:
7005 break;
7006 }
7007
7008 HALT_NYI;
7009 }
7010
7011 /* 3 sources. */
7012
7013 /* Float multiply add. */
7014 static void
7015 fmadds (sim_cpu *cpu)
7016 {
7017 unsigned sa = INSTR (14, 10);
7018 unsigned sm = INSTR (20, 16);
7019 unsigned sn = INSTR ( 9, 5);
7020 unsigned sd = INSTR ( 4, 0);
7021
7022 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7023 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7024 + aarch64_get_FP_float (cpu, sn)
7025 * aarch64_get_FP_float (cpu, sm));
7026 }
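
/* Note that fmadds above rounds the multiply and the add separately,
   whereas the architected FMADD fuses them with a single rounding.  A
   bit-exact sketch, assuming C99 fmaf from <math.h> (already included)
   and not wired into the decoder:  */
static float
fmadds_fused_sketch (float a, float n, float m)
{
  /* fmaf computes n * m + a with a single rounding, as FMADD requires.  */
  return fmaf (n, m, a);
}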
7027
7028 /* Double multiply add. */
7029 static void
7030 fmaddd (sim_cpu *cpu)
7031 {
7032 unsigned sa = INSTR (14, 10);
7033 unsigned sm = INSTR (20, 16);
7034 unsigned sn = INSTR ( 9, 5);
7035 unsigned sd = INSTR ( 4, 0);
7036
7037 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7038 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7039 + aarch64_get_FP_double (cpu, sn)
7040 * aarch64_get_FP_double (cpu, sm));
7041 }
7042
7043 /* Float multiply subtract. */
7044 static void
7045 fmsubs (sim_cpu *cpu)
7046 {
7047 unsigned sa = INSTR (14, 10);
7048 unsigned sm = INSTR (20, 16);
7049 unsigned sn = INSTR ( 9, 5);
7050 unsigned sd = INSTR ( 4, 0);
7051
7052 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7053 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7054 - aarch64_get_FP_float (cpu, sn)
7055 * aarch64_get_FP_float (cpu, sm));
7056 }
7057
7058 /* Double multiply subtract. */
7059 static void
7060 fmsubd (sim_cpu *cpu)
7061 {
7062 unsigned sa = INSTR (14, 10);
7063 unsigned sm = INSTR (20, 16);
7064 unsigned sn = INSTR ( 9, 5);
7065 unsigned sd = INSTR ( 4, 0);
7066
7067 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7068 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7069 - aarch64_get_FP_double (cpu, sn)
7070 * aarch64_get_FP_double (cpu, sm));
7071 }
7072
7073 /* Float negative multiply add. */
7074 static void
7075 fnmadds (sim_cpu *cpu)
7076 {
7077 unsigned sa = INSTR (14, 10);
7078 unsigned sm = INSTR (20, 16);
7079 unsigned sn = INSTR ( 9, 5);
7080 unsigned sd = INSTR ( 4, 0);
7081
7082 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7083 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7084 + (- aarch64_get_FP_float (cpu, sn))
7085 * aarch64_get_FP_float (cpu, sm));
7086 }
7087
7088 /* Double negative multiply add. */
7089 static void
7090 fnmaddd (sim_cpu *cpu)
7091 {
7092 unsigned sa = INSTR (14, 10);
7093 unsigned sm = INSTR (20, 16);
7094 unsigned sn = INSTR ( 9, 5);
7095 unsigned sd = INSTR ( 4, 0);
7096
7097 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7098 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7099 + (- aarch64_get_FP_double (cpu, sn))
7100 * aarch64_get_FP_double (cpu, sm));
7101 }
7102
7103 /* Float negative multiply subtract. */
7104 static void
7105 fnmsubs (sim_cpu *cpu)
7106 {
7107 unsigned sa = INSTR (14, 10);
7108 unsigned sm = INSTR (20, 16);
7109 unsigned sn = INSTR ( 9, 5);
7110 unsigned sd = INSTR ( 4, 0);
7111
7112 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7113 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7114 + aarch64_get_FP_float (cpu, sn)
7115 * aarch64_get_FP_float (cpu, sm));
7116 }
7117
7118 /* Double negative multiply subtract. */
7119 static void
7120 fnmsubd (sim_cpu *cpu)
7121 {
7122 unsigned sa = INSTR (14, 10);
7123 unsigned sm = INSTR (20, 16);
7124 unsigned sn = INSTR ( 9, 5);
7125 unsigned sd = INSTR ( 4, 0);
7126
7127 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7128 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7129 + aarch64_get_FP_double (cpu, sn)
7130 * aarch64_get_FP_double (cpu, sm));
7131 }
7132
7133 static void
7134 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7135 {
7136 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7137 instr[30] = 0
7138 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7139 instr[28,25] = 1111
7140 instr[24] = 1
7141 instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
7142 instr[21] ==> o1 : 0 ==> unnegated, 1 ==> negated
     instr[15] ==> o2 : 0 ==> ADD, 1 ==> SUB
     instr[14,10] = Ra
     instr[9,5] = Rn
     instr[4,0] = Rd  */
7144
7145 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7146 /* dispatch on combined type:o1:o2. */
7147 uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
7148
7149 if (M_S != 0)
7150 HALT_UNALLOC;
7151
7152 switch (dispatch)
7153 {
7154 case 0: fmadds (cpu); return;
7155 case 1: fmsubs (cpu); return;
7156 case 2: fnmadds (cpu); return;
7157 case 3: fnmsubs (cpu); return;
7158 case 4: fmaddd (cpu); return;
7159 case 5: fmsubd (cpu); return;
7160 case 6: fnmaddd (cpu); return;
7161 case 7: fnmsubd (cpu); return;
7162 default:
7163 /* type > 1 is currently unallocated. */
7164 HALT_UNALLOC;
7165 }
7166 }
7167
7168 static void
7169 dexSimpleFPFixedConvert (sim_cpu *cpu)
7170 {
7171 HALT_NYI;
7172 }
7173
7174 static void
7175 dexSimpleFPCondCompare (sim_cpu *cpu)
7176 {
7177 /* instr [31,23] = 0001 1110 0
7178 instr [22] = type
7179 instr [21] = 1
7180 instr [20,16] = Rm
7181 instr [15,12] = condition
7182 instr [11,10] = 01
7183 instr [9,5] = Rn
7184 instr [4] = 0
7185 instr [3,0] = nzcv */
7186
7187 unsigned rm = INSTR (20, 16);
7188 unsigned rn = INSTR (9, 5);
7189
7190 NYI_assert (31, 23, 0x3C);
7191 NYI_assert (11, 10, 0x1);
7192 NYI_assert (4, 4, 0);
7193
7194 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7195 if (! testConditionCode (cpu, INSTR (15, 12)))
7196 {
7197 aarch64_set_CPSR (cpu, INSTR (3, 0));
7198 return;
7199 }
7200
7201 if (INSTR (22, 22))
7202 {
7203 /* Double precision. */
7204 double val1 = aarch64_get_vec_double (cpu, rn, 0);
7205 double val2 = aarch64_get_vec_double (cpu, rm, 0);
7206
7207 /* FIXME: Check for NaNs. */
7208 if (val1 == val2)
7209 aarch64_set_CPSR (cpu, (Z | C));
7210 else if (val1 < val2)
7211 aarch64_set_CPSR (cpu, N);
7212 else /* val1 > val2 */
7213 aarch64_set_CPSR (cpu, C);
7214 }
7215 else
7216 {
7217 /* Single precision. */
7218 float val1 = aarch64_get_vec_float (cpu, rn, 0);
7219 float val2 = aarch64_get_vec_float (cpu, rm, 0);
7220
7221 /* FIXME: Check for NaNs. */
7222 if (val1 == val2)
7223 aarch64_set_CPSR (cpu, (Z | C));
7224 else if (val1 < val2)
7225 aarch64_set_CPSR (cpu, N);
7226 else /* val1 > val2 */
7227 aarch64_set_CPSR (cpu, C);
7228 }
7229 }
7230
7231 /* 2 sources. */
7232
7233 /* Float add. */
7234 static void
7235 fadds (sim_cpu *cpu)
7236 {
7237 unsigned sm = INSTR (20, 16);
7238 unsigned sn = INSTR ( 9, 5);
7239 unsigned sd = INSTR ( 4, 0);
7240
7241 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7242 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7243 + aarch64_get_FP_float (cpu, sm));
7244 }
7245
7246 /* Double add. */
7247 static void
7248 faddd (sim_cpu *cpu)
7249 {
7250 unsigned sm = INSTR (20, 16);
7251 unsigned sn = INSTR ( 9, 5);
7252 unsigned sd = INSTR ( 4, 0);
7253
7254 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7255 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7256 + aarch64_get_FP_double (cpu, sm));
7257 }
7258
7259 /* Float divide. */
7260 static void
7261 fdivs (sim_cpu *cpu)
7262 {
7263 unsigned sm = INSTR (20, 16);
7264 unsigned sn = INSTR ( 9, 5);
7265 unsigned sd = INSTR ( 4, 0);
7266
7267 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7268 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7269 / aarch64_get_FP_float (cpu, sm));
7270 }
7271
7272 /* Double divide. */
7273 static void
7274 fdivd (sim_cpu *cpu)
7275 {
7276 unsigned sm = INSTR (20, 16);
7277 unsigned sn = INSTR ( 9, 5);
7278 unsigned sd = INSTR ( 4, 0);
7279
7280 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7281 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7282 / aarch64_get_FP_double (cpu, sm));
7283 }
7284
7285 /* Float multiply. */
7286 static void
7287 fmuls (sim_cpu *cpu)
7288 {
7289 unsigned sm = INSTR (20, 16);
7290 unsigned sn = INSTR ( 9, 5);
7291 unsigned sd = INSTR ( 4, 0);
7292
7293 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7294 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7295 * aarch64_get_FP_float (cpu, sm));
7296 }
7297
7298 /* Double multiply. */
7299 static void
7300 fmuld (sim_cpu *cpu)
7301 {
7302 unsigned sm = INSTR (20, 16);
7303 unsigned sn = INSTR ( 9, 5);
7304 unsigned sd = INSTR ( 4, 0);
7305
7306 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7307 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7308 * aarch64_get_FP_double (cpu, sm));
7309 }
7310
7311 /* Float negate and multiply. */
7312 static void
7313 fnmuls (sim_cpu *cpu)
7314 {
7315 unsigned sm = INSTR (20, 16);
7316 unsigned sn = INSTR ( 9, 5);
7317 unsigned sd = INSTR ( 4, 0);
7318
7319 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7320 aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7321 * aarch64_get_FP_float (cpu, sm)));
7322 }
7323
7324 /* Double negate and multiply. */
7325 static void
7326 fnmuld (sim_cpu *cpu)
7327 {
7328 unsigned sm = INSTR (20, 16);
7329 unsigned sn = INSTR ( 9, 5);
7330 unsigned sd = INSTR ( 4, 0);
7331
7332 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7333 aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7334 * aarch64_get_FP_double (cpu, sm)));
7335 }
7336
7337 /* Float subtract. */
7338 static void
7339 fsubs (sim_cpu *cpu)
7340 {
7341 unsigned sm = INSTR (20, 16);
7342 unsigned sn = INSTR ( 9, 5);
7343 unsigned sd = INSTR ( 4, 0);
7344
7345 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7346 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7347 - aarch64_get_FP_float (cpu, sm));
7348 }
7349
7350 /* Double subtract. */
7351 static void
7352 fsubd (sim_cpu *cpu)
7353 {
7354 unsigned sm = INSTR (20, 16);
7355 unsigned sn = INSTR ( 9, 5);
7356 unsigned sd = INSTR ( 4, 0);
7357
7358 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7359 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7360 - aarch64_get_FP_double (cpu, sm));
7361 }
7362
7363 static void
7364 do_FMINNM (sim_cpu *cpu)
7365 {
7366 /* instr[31,23] = 0 0011 1100
7367 instr[22] = float(0)/double(1)
7368 instr[21] = 1
7369 instr[20,16] = Sm
7370 instr[15,10] = 01 1110
7371 instr[9,5] = Sn
     instr[4,0]   = Sd  */
7373
7374 unsigned sm = INSTR (20, 16);
7375 unsigned sn = INSTR ( 9, 5);
7376 unsigned sd = INSTR ( 4, 0);
7377
7378 NYI_assert (31, 23, 0x03C);
7379 NYI_assert (15, 10, 0x1E);
7380
7381 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7382 if (INSTR (22, 22))
7383 aarch64_set_FP_double (cpu, sd,
7384 dminnm (aarch64_get_FP_double (cpu, sn),
7385 aarch64_get_FP_double (cpu, sm)));
7386 else
7387 aarch64_set_FP_float (cpu, sd,
7388 fminnm (aarch64_get_FP_float (cpu, sn),
7389 aarch64_get_FP_float (cpu, sm)));
7390 }
7391
7392 static void
7393 do_FMAXNM (sim_cpu *cpu)
7394 {
7395 /* instr[31,23] = 0 0011 1100
7396 instr[22] = float(0)/double(1)
7397 instr[21] = 1
7398 instr[20,16] = Sm
7399 instr[15,10] = 01 1010
7400 instr[9,5] = Sn
     instr[4,0]   = Sd  */
7402
7403 unsigned sm = INSTR (20, 16);
7404 unsigned sn = INSTR ( 9, 5);
7405 unsigned sd = INSTR ( 4, 0);
7406
7407 NYI_assert (31, 23, 0x03C);
7408 NYI_assert (15, 10, 0x1A);
7409
7410 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7411 if (INSTR (22, 22))
7412 aarch64_set_FP_double (cpu, sd,
7413 dmaxnm (aarch64_get_FP_double (cpu, sn),
7414 aarch64_get_FP_double (cpu, sm)));
7415 else
7416 aarch64_set_FP_float (cpu, sd,
7417 fmaxnm (aarch64_get_FP_float (cpu, sn),
7418 aarch64_get_FP_float (cpu, sm)));
7419 }
7420
7421 static void
7422 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7423 {
7424 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7425 instr[30] = 0
7426 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7427 instr[28,25] = 1111
7428 instr[24] = 0
7429 instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
7430 instr[21] = 1
7431 instr[20,16] = Vm
7432 instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7433 0010 ==> FADD, 0011 ==> FSUB,
7434 0100 ==> FMAX, 0101 ==> FMIN
7435 0110 ==> FMAXNM, 0111 ==> FMINNM
7436 1000 ==> FNMUL, ow ==> UNALLOC
7437 instr[11,10] = 10
7438 instr[9,5] = Vn
7439 instr[4,0] = Vd */
7440
7441 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7442 uint32_t type = INSTR (23, 22);
7443 /* Dispatch on opcode. */
7444 uint32_t dispatch = INSTR (15, 12);
7445
7446 if (type > 1)
7447 HALT_UNALLOC;
7448
7449 if (M_S != 0)
7450 HALT_UNALLOC;
7451
7452 if (type)
7453 switch (dispatch)
7454 {
7455 case 0: fmuld (cpu); return;
7456 case 1: fdivd (cpu); return;
7457 case 2: faddd (cpu); return;
7458 case 3: fsubd (cpu); return;
7459 case 6: do_FMAXNM (cpu); return;
7460 case 7: do_FMINNM (cpu); return;
7461 case 8: fnmuld (cpu); return;
7462
7463 /* Have not yet implemented fmax and fmin. */
7464 case 4:
7465 case 5:
7466 HALT_NYI;
7467
7468 default:
7469 HALT_UNALLOC;
7470 }
7471 else /* type == 0 => floats. */
7472 switch (dispatch)
7473 {
7474 case 0: fmuls (cpu); return;
7475 case 1: fdivs (cpu); return;
7476 case 2: fadds (cpu); return;
7477 case 3: fsubs (cpu); return;
7478 case 6: do_FMAXNM (cpu); return;
7479 case 7: do_FMINNM (cpu); return;
7480 case 8: fnmuls (cpu); return;
7481
7482 case 4:
7483 case 5:
7484 HALT_NYI;
7485
7486 default:
7487 HALT_UNALLOC;
7488 }
7489 }
7490
7491 static void
7492 dexSimpleFPCondSelect (sim_cpu *cpu)
7493 {
7494 /* FCSEL
7495 instr[31,23] = 0 0011 1100
7496 instr[22] = 0=>single 1=>double
7497 instr[21] = 1
7498 instr[20,16] = Sm
7499 instr[15,12] = cond
7500 instr[11,10] = 11
7501 instr[9,5] = Sn
     instr[4,0]   = Sd  */
7503 unsigned sm = INSTR (20, 16);
7504 unsigned sn = INSTR ( 9, 5);
7505 unsigned sd = INSTR ( 4, 0);
7506 uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7507
7508 NYI_assert (31, 23, 0x03C);
7509 NYI_assert (11, 10, 0x3);
7510
7511 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7512 if (INSTR (22, 22))
7513 aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7514 : aarch64_get_FP_double (cpu, sm)));
7515 else
7516 aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7517 : aarch64_get_FP_float (cpu, sm)));
7518 }
7519
7520 /* Store 32 bit unscaled signed 9 bit. */
7521 static void
7522 fsturs (sim_cpu *cpu, int32_t offset)
7523 {
7524 unsigned int rn = INSTR (9, 5);
7525 unsigned int st = INSTR (4, 0);
7526
7527 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7529 aarch64_get_vec_u32 (cpu, st, 0));
7530 }
7531
7532 /* Store 64 bit unscaled signed 9 bit. */
7533 static void
7534 fsturd (sim_cpu *cpu, int32_t offset)
7535 {
7536 unsigned int rn = INSTR (9, 5);
7537 unsigned int st = INSTR (4, 0);
7538
7539 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7541 aarch64_get_vec_u64 (cpu, st, 0));
7542 }
7543
7544 /* Store 128 bit unscaled signed 9 bit. */
7545 static void
7546 fsturq (sim_cpu *cpu, int32_t offset)
7547 {
7548 unsigned int rn = INSTR (9, 5);
7549 unsigned int st = INSTR (4, 0);
7550 FRegister a;
7551
7552 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7553 aarch64_get_FP_long_double (cpu, st, & a);
7554 aarch64_set_mem_long_double (cpu,
			       aarch64_get_reg_u64 (cpu, rn, SP_OK)
7556 + offset, a);
7557 }
7558
7559 /* TODO FP move register. */
7560
7561 /* 32 bit fp to fp move register. */
7562 static void
7563 ffmovs (sim_cpu *cpu)
7564 {
7565 unsigned int rn = INSTR (9, 5);
7566 unsigned int st = INSTR (4, 0);
7567
7568 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7569 aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7570 }
7571
7572 /* 64 bit fp to fp move register. */
7573 static void
7574 ffmovd (sim_cpu *cpu)
7575 {
7576 unsigned int rn = INSTR (9, 5);
7577 unsigned int st = INSTR (4, 0);
7578
7579 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7580 aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7581 }
7582
7583 /* 32 bit GReg to Vec move register. */
7584 static void
7585 fgmovs (sim_cpu *cpu)
7586 {
7587 unsigned int rn = INSTR (9, 5);
7588 unsigned int st = INSTR (4, 0);
7589
7590 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7591 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7592 }
7593
7594 /* 64 bit g to fp move register. */
7595 static void
7596 fgmovd (sim_cpu *cpu)
7597 {
7598 unsigned int rn = INSTR (9, 5);
7599 unsigned int st = INSTR (4, 0);
7600
7601 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7602 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7603 }
7604
7605 /* 32 bit fp to g move register. */
7606 static void
7607 gfmovs (sim_cpu *cpu)
7608 {
7609 unsigned int rn = INSTR (9, 5);
7610 unsigned int st = INSTR (4, 0);
7611
7612 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7613 aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7614 }
7615
7616 /* 64 bit fp to g move register. */
7617 static void
7618 gfmovd (sim_cpu *cpu)
7619 {
7620 unsigned int rn = INSTR (9, 5);
7621 unsigned int st = INSTR (4, 0);
7622
7623 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7624 aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7625 }
7626
7627 /* FP move immediate
7628
7629 These install an immediate 8 bit value in the target register
7630 where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7631 bit exponent. */
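
/* A sketch of that expansion for the single-precision case, assuming
   the usual AArch64 imm8 rule: sign = imm8<7>, exponent =
   NOT(imm8<6>) : Replicate(imm8<6>,5) : imm8<5:4>, fraction =
   imm8<3:0> followed by 19 zeros.  fp_immediate_for_encoding_32 is
   assumed to implement the same rule; e.g. imm8 == 0x70 expands to
   1.0f.  */
static float
fp_imm8_expand_sketch (uint32_t imm8)
{
  uint32_t sign = (imm8 >> 7) & 1;
  uint32_t b6   = (imm8 >> 6) & 1;
  uint32_t exp  = ((b6 ^ 1) << 7) | ((b6 ? 0x1F : 0x00) << 2)
    | ((imm8 >> 4) & 3);
  uint32_t bits = (sign << 31) | (exp << 23) | ((imm8 & 0xF) << 19);
  float result;

  memcpy (&result, &bits, sizeof (result));
  return result;
}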
7632
7633 static void
7634 fmovs (sim_cpu *cpu)
7635 {
7636 unsigned int sd = INSTR (4, 0);
7637 uint32_t imm = INSTR (20, 13);
7638 float f = fp_immediate_for_encoding_32 (imm);
7639
7640 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7641 aarch64_set_FP_float (cpu, sd, f);
7642 }
7643
7644 static void
7645 fmovd (sim_cpu *cpu)
7646 {
7647 unsigned int sd = INSTR (4, 0);
7648 uint32_t imm = INSTR (20, 13);
7649 double d = fp_immediate_for_encoding_64 (imm);
7650
7651 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7652 aarch64_set_FP_double (cpu, sd, d);
7653 }
7654
7655 static void
7656 dexSimpleFPImmediate (sim_cpu *cpu)
7657 {
  /* instr[31,23] == 0 0011 1100
7659 instr[22] == type : single(0)/double(1)
7660 instr[21] == 1
7661 instr[20,13] == imm8
7662 instr[12,10] == 100
     instr[9,5] == imm5 : 00000 ==> OK, ow ==> UNALLOC
7664 instr[4,0] == Rd */
7665 uint32_t imm5 = INSTR (9, 5);
7666
7667 NYI_assert (31, 23, 0x3C);
7668
7669 if (imm5 != 0)
7670 HALT_UNALLOC;
7671
7672 if (INSTR (22, 22))
7673 fmovd (cpu);
7674 else
7675 fmovs (cpu);
7676 }
7677
7678 /* TODO specific decode and execute for group Load Store. */
7679
7680 /* TODO FP load/store single register (unscaled offset). */
7681
7682 /* TODO load 8 bit unscaled signed 9 bit. */
7683 /* TODO load 16 bit unscaled signed 9 bit. */
7684
7685 /* Load 32 bit unscaled signed 9 bit. */
7686 static void
7687 fldurs (sim_cpu *cpu, int32_t offset)
7688 {
7689 unsigned int rn = INSTR (9, 5);
7690 unsigned int st = INSTR (4, 0);
7691
7692 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7693 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7694 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7695 }
7696
7697 /* Load 64 bit unscaled signed 9 bit. */
7698 static void
7699 fldurd (sim_cpu *cpu, int32_t offset)
7700 {
7701 unsigned int rn = INSTR (9, 5);
7702 unsigned int st = INSTR (4, 0);
7703
7704 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7705 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7706 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7707 }
7708
7709 /* Load 128 bit unscaled signed 9 bit. */
7710 static void
7711 fldurq (sim_cpu *cpu, int32_t offset)
7712 {
7713 unsigned int rn = INSTR (9, 5);
7714 unsigned int st = INSTR (4, 0);
7715 FRegister a;
7716 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7717
7718 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7719 aarch64_get_mem_long_double (cpu, addr, & a);
7720 aarch64_set_FP_long_double (cpu, st, a);
7721 }
7722
7723 /* TODO store 8 bit unscaled signed 9 bit. */
7724 /* TODO store 16 bit unscaled signed 9 bit. */
7725
7726
7727 /* 1 source. */
7728
7729 /* Float absolute value. */
7730 static void
7731 fabss (sim_cpu *cpu)
7732 {
7733 unsigned sn = INSTR (9, 5);
7734 unsigned sd = INSTR (4, 0);
7735 float value = aarch64_get_FP_float (cpu, sn);
7736
7737 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7738 aarch64_set_FP_float (cpu, sd, fabsf (value));
7739 }
7740
7741 /* Double absolute value. */
7742 static void
fabsd (sim_cpu *cpu)
7744 {
7745 unsigned sn = INSTR (9, 5);
7746 unsigned sd = INSTR (4, 0);
7747 double value = aarch64_get_FP_double (cpu, sn);
7748
7749 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7750 aarch64_set_FP_double (cpu, sd, fabs (value));
7751 }
7752
7753 /* Float negative value. */
7754 static void
7755 fnegs (sim_cpu *cpu)
7756 {
7757 unsigned sn = INSTR (9, 5);
7758 unsigned sd = INSTR (4, 0);
7759
7760 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7761 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7762 }
7763
7764 /* Double negative value. */
7765 static void
7766 fnegd (sim_cpu *cpu)
7767 {
7768 unsigned sn = INSTR (9, 5);
7769 unsigned sd = INSTR (4, 0);
7770
7771 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7772 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7773 }
7774
7775 /* Float square root. */
7776 static void
7777 fsqrts (sim_cpu *cpu)
7778 {
7779 unsigned sn = INSTR (9, 5);
7780 unsigned sd = INSTR (4, 0);
7781
7782 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7783 aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7784 }
7785
7786 /* Double square root. */
7787 static void
7788 fsqrtd (sim_cpu *cpu)
7789 {
7790 unsigned sn = INSTR (9, 5);
7791 unsigned sd = INSTR (4, 0);
7792
7793 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7794 aarch64_set_FP_double (cpu, sd,
7795 sqrt (aarch64_get_FP_double (cpu, sn)));
7796 }
7797
7798 /* Convert double to float. */
7799 static void
7800 fcvtds (sim_cpu *cpu)
7801 {
7802 unsigned sn = INSTR (9, 5);
7803 unsigned sd = INSTR (4, 0);
7804
7805 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7806 aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7807 }
7808
7809 /* Convert float to double. */
7810 static void
fcvtsd (sim_cpu *cpu)
7812 {
7813 unsigned sn = INSTR (9, 5);
7814 unsigned sd = INSTR (4, 0);
7815
7816 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7817 aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7818 }
7819
7820 static void
7821 do_FRINT (sim_cpu *cpu)
7822 {
7823 /* instr[31,23] = 0001 1110 0
7824 instr[22] = single(0)/double(1)
7825 instr[21,18] = 1001
7826 instr[17,15] = rounding mode
7827 instr[14,10] = 10000
7828 instr[9,5] = source
7829 instr[4,0] = dest */
7830
7831 float val;
7832 unsigned rs = INSTR (9, 5);
7833 unsigned rd = INSTR (4, 0);
7834 unsigned int rmode = INSTR (17, 15);
7835
7836 NYI_assert (31, 23, 0x03C);
7837 NYI_assert (21, 18, 0x9);
7838 NYI_assert (14, 10, 0x10);
7839
7840 if (rmode == 6 || rmode == 7)
7841 /* FIXME: Add support for rmode == 6 exactness check. */
7842 rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7843
7844 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7845 if (INSTR (22, 22))
7846 {
7847 double val = aarch64_get_FP_double (cpu, rs);
7848
7849 switch (rmode)
7850 {
	case 0: /* mode N: nearest or even.  */
	  {
	    double rval = round (val);

	    /* round () breaks ties away from zero; pull an odd result
	       back to the even neighbour.  */
	    if (fabs (val - rval) == 0.5
		&& fmod (rval, 2.0) != 0.0)
	      rval -= copysign (1.0, val);

	    aarch64_set_FP_double (cpu, rd, rval);
	    return;
	  }
7864
	case 1: /* mode P: towards +inf.  */
	  aarch64_set_FP_double (cpu, rd, ceil (val));
	  return;

	case 2: /* mode M: towards -inf.  */
	  aarch64_set_FP_double (cpu, rd, floor (val));
	  return;
7878
7879 case 3: /* mode Z: towards 0. */
7880 aarch64_set_FP_double (cpu, rd, trunc (val));
7881 return;
7882
7883 case 4: /* mode A: away from 0. */
7884 aarch64_set_FP_double (cpu, rd, round (val));
7885 return;
7886
7887 case 6: /* mode X: use FPCR with exactness check. */
7888 case 7: /* mode I: use FPCR mode. */
7889 HALT_NYI;
7890
7891 default:
7892 HALT_UNALLOC;
7893 }
7894 }
7895
7896 val = aarch64_get_FP_float (cpu, rs);
7897
7898 switch (rmode)
7899 {
    case 0: /* mode N: nearest or even.  */
      {
	float rval = roundf (val);

	/* roundf () breaks ties away from zero; pull an odd result
	   back to the even neighbour.  */
	if (fabsf (val - rval) == 0.5f
	    && fmodf (rval, 2.0f) != 0.0f)
	  rval -= copysignf (1.0f, val);

	aarch64_set_FP_float (cpu, rd, rval);
	return;
      }
7913
    case 1: /* mode P: towards +inf.  */
      aarch64_set_FP_float (cpu, rd, ceilf (val));
      return;

    case 2: /* mode M: towards -inf.  */
      aarch64_set_FP_float (cpu, rd, floorf (val));
      return;
7927
7928 case 3: /* mode Z: towards 0. */
7929 aarch64_set_FP_float (cpu, rd, truncf (val));
7930 return;
7931
7932 case 4: /* mode A: away from 0. */
7933 aarch64_set_FP_float (cpu, rd, roundf (val));
7934 return;
7935
7936 case 6: /* mode X: use FPCR with exactness check. */
7937 case 7: /* mode I: use FPCR mode. */
7938 HALT_NYI;
7939
7940 default:
7941 HALT_UNALLOC;
7942 }
7943 }
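
/* Compact cross-check for the round-to-nearest-even cases above
   (illustrative only, not called by the simulator): C99 remainder ()
   rounds its implied quotient to the nearest integer, ties to even,
   so round-half-to-even falls out directly, e.g. 2.5 -> 2.0 and
   3.5 -> 4.0.  */
static double
frint_tieeven_sketch (double val)
{
  return val - remainder (val, 1.0);
}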
7944
7945 /* Convert half to float. */
7946 static void
7947 do_FCVT_half_to_single (sim_cpu *cpu)
7948 {
7949 unsigned rn = INSTR (9, 5);
7950 unsigned rd = INSTR (4, 0);
7951
7952 NYI_assert (31, 10, 0x7B890);
7953
7954 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7955 aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half (cpu, rn));
7956 }
7957
7958 /* Convert half to double. */
7959 static void
7960 do_FCVT_half_to_double (sim_cpu *cpu)
7961 {
7962 unsigned rn = INSTR (9, 5);
7963 unsigned rd = INSTR (4, 0);
7964
7965 NYI_assert (31, 10, 0x7B8B0);
7966
7967 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7968 aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half (cpu, rn));
7969 }
7970
7971 static void
7972 do_FCVT_single_to_half (sim_cpu *cpu)
7973 {
7974 unsigned rn = INSTR (9, 5);
7975 unsigned rd = INSTR (4, 0);
7976
7977 NYI_assert (31, 10, 0x788F0);
7978
7979 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7980 aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float (cpu, rn));
7981 }
7982
7983 /* Convert double to half. */
7984 static void
7985 do_FCVT_double_to_half (sim_cpu *cpu)
7986 {
7987 unsigned rn = INSTR (9, 5);
7988 unsigned rd = INSTR (4, 0);
7989
7990 NYI_assert (31, 10, 0x798F0);
7991
7992 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7993 aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double (cpu, rn));
7994 }
7995
7996 static void
7997 dexSimpleFPDataProc1Source (sim_cpu *cpu)
7998 {
7999 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
8000 instr[30] = 0
8001 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
8002 instr[28,25] = 1111
8003 instr[24] = 0
8004 instr[23,22] ==> type : 00 ==> source is single,
8005 01 ==> source is double
8006 10 ==> UNALLOC
8007 11 ==> UNALLOC or source is half
8008 instr[21] = 1
8009 instr[20,15] ==> opcode : with type 00 or 01
8010 000000 ==> FMOV, 000001 ==> FABS,
8011 000010 ==> FNEG, 000011 ==> FSQRT,
8012 000100 ==> UNALLOC, 000101 ==> FCVT,(to single/double)
8013 000110 ==> UNALLOC, 000111 ==> FCVT (to half)
8014 001000 ==> FRINTN, 001001 ==> FRINTP,
8015 001010 ==> FRINTM, 001011 ==> FRINTZ,
8016 001100 ==> FRINTA, 001101 ==> UNALLOC
8017 001110 ==> FRINTX, 001111 ==> FRINTI
8018 with type 11
8019 000100 ==> FCVT (half-to-single)
8020 000101 ==> FCVT (half-to-double)
8021 instr[14,10] = 10000. */
8022
8023 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8024 uint32_t type = INSTR (23, 22);
8025 uint32_t opcode = INSTR (20, 15);
8026
8027 if (M_S != 0)
8028 HALT_UNALLOC;
8029
8030 if (type == 3)
8031 {
8032 if (opcode == 4)
8033 do_FCVT_half_to_single (cpu);
8034 else if (opcode == 5)
8035 do_FCVT_half_to_double (cpu);
8036 else
8037 HALT_UNALLOC;
8038 return;
8039 }
8040
8041 if (type == 2)
8042 HALT_UNALLOC;
8043
8044 switch (opcode)
8045 {
8046 case 0:
8047 if (type)
8048 ffmovd (cpu);
8049 else
8050 ffmovs (cpu);
8051 return;
8052
8053 case 1:
8054 if (type)
	fabsd (cpu);
8056 else
8057 fabss (cpu);
8058 return;
8059
8060 case 2:
8061 if (type)
8062 fnegd (cpu);
8063 else
8064 fnegs (cpu);
8065 return;
8066
8067 case 3:
8068 if (type)
8069 fsqrtd (cpu);
8070 else
8071 fsqrts (cpu);
8072 return;
8073
8074 case 4:
8075 if (type)
8076 fcvtds (cpu);
8077 else
8078 HALT_UNALLOC;
8079 return;
8080
8081 case 5:
8082 if (type)
8083 HALT_UNALLOC;
      fcvtsd (cpu);
8085 return;
8086
8087 case 8: /* FRINTN etc. */
8088 case 9:
8089 case 10:
8090 case 11:
8091 case 12:
8092 case 14:
8093 case 15:
8094 do_FRINT (cpu);
8095 return;
8096
8097 case 7:
8098 if (INSTR (22, 22))
8099 do_FCVT_double_to_half (cpu);
8100 else
8101 do_FCVT_single_to_half (cpu);
8102 return;
8103
8104 case 13:
8105 HALT_NYI;
8106
8107 default:
8108 HALT_UNALLOC;
8109 }
8110 }
8111
8112 /* 32 bit signed int to float. */
8113 static void
8114 scvtf32 (sim_cpu *cpu)
8115 {
8116 unsigned rn = INSTR (9, 5);
8117 unsigned sd = INSTR (4, 0);
8118
8119 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8120 aarch64_set_FP_float
8121 (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8122 }
8123
8124 /* signed int to float. */
8125 static void
8126 scvtf (sim_cpu *cpu)
8127 {
8128 unsigned rn = INSTR (9, 5);
8129 unsigned sd = INSTR (4, 0);
8130
8131 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8132 aarch64_set_FP_float
8133 (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8134 }
8135
8136 /* 32 bit signed int to double. */
8137 static void
8138 scvtd32 (sim_cpu *cpu)
8139 {
8140 unsigned rn = INSTR (9, 5);
8141 unsigned sd = INSTR (4, 0);
8142
8143 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8144 aarch64_set_FP_double
8145 (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8146 }
8147
8148 /* signed int to double. */
8149 static void
8150 scvtd (sim_cpu *cpu)
8151 {
8152 unsigned rn = INSTR (9, 5);
8153 unsigned sd = INSTR (4, 0);
8154
8155 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8156 aarch64_set_FP_double
8157 (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8158 }
8159
8160 static const float FLOAT_INT_MAX = (float) INT_MAX;
8161 static const float FLOAT_INT_MIN = (float) INT_MIN;
8162 static const double DOUBLE_INT_MAX = (double) INT_MAX;
8163 static const double DOUBLE_INT_MIN = (double) INT_MIN;
8164 static const float FLOAT_LONG_MAX = (float) LONG_MAX;
8165 static const float FLOAT_LONG_MIN = (float) LONG_MIN;
8166 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8167 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
8168
8169 #define UINT_MIN 0
8170 #define ULONG_MIN 0
8171 static const float FLOAT_UINT_MAX = (float) UINT_MAX;
8172 static const float FLOAT_UINT_MIN = (float) UINT_MIN;
8173 static const double DOUBLE_UINT_MAX = (double) UINT_MAX;
8174 static const double DOUBLE_UINT_MIN = (double) UINT_MIN;
8175 static const float FLOAT_ULONG_MAX = (float) ULONG_MAX;
8176 static const float FLOAT_ULONG_MIN = (float) ULONG_MIN;
8177 static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8178 static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
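
/* Note that some of these bounds are not exactly representable: e.g.
   (float) INT_MAX rounds up to 2^31, so the ">=" test in
   RAISE_EXCEPTIONS below also treats 2^31 itself as out of range,
   which is correct since the largest valid int is 2^31 - 1.  */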
8179
8180 /* Check for FP exception conditions:
8181 NaN raises IO
8182 Infinity raises IO
8183 Out of Range raises IO and IX and saturates value
8184 Denormal raises ID and IX and sets to zero. */
8185 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE) \
8186 do \
8187 { \
8188 switch (fpclassify (F)) \
8189 { \
8190 case FP_INFINITE: \
8191 case FP_NAN: \
8192 aarch64_set_FPSR (cpu, IO); \
8193 if (signbit (F)) \
8194 VALUE = ITYPE##_MAX; \
8195 else \
8196 VALUE = ITYPE##_MIN; \
8197 break; \
8198 \
8199 case FP_NORMAL: \
8200 if (F >= FTYPE##_##ITYPE##_MAX) \
8201 { \
8202 aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX); \
8203 VALUE = ITYPE##_MAX; \
8204 } \
8205 else if (F <= FTYPE##_##ITYPE##_MIN) \
8206 { \
8207 aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX); \
8208 VALUE = ITYPE##_MIN; \
8209 } \
8210 break; \
8211 \
8212 case FP_SUBNORMAL: \
8213 aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID); \
8214 VALUE = 0; \
8215 break; \
8216 \
8217 default: \
8218 case FP_ZERO: \
8219 VALUE = 0; \
8220 break; \
8221 } \
8222 } \
8223 while (0)
8224
8225 /* 32 bit convert float to signed int truncate towards zero. */
8226 static void
8227 fcvtszs32 (sim_cpu *cpu)
8228 {
8229 unsigned sn = INSTR (9, 5);
8230 unsigned rd = INSTR (4, 0);
8231 /* TODO : check that this rounds toward zero. */
8232 float f = aarch64_get_FP_float (cpu, sn);
8233 int32_t value = (int32_t) f;
8234
8235 RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8236
8237 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8238 /* Avoid sign extension to 64 bit. */
8239 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8240 }
8241
8242 /* 64 bit convert float to signed int truncate towards zero. */
8243 static void
8244 fcvtszs (sim_cpu *cpu)
8245 {
8246 unsigned sn = INSTR (9, 5);
8247 unsigned rd = INSTR (4, 0);
8248 float f = aarch64_get_FP_float (cpu, sn);
8249 int64_t value = (int64_t) f;
8250
8251 RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8252
8253 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8254 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8255 }
8256
8257 /* 32 bit convert double to signed int truncate towards zero. */
8258 static void
8259 fcvtszd32 (sim_cpu *cpu)
8260 {
8261 unsigned sn = INSTR (9, 5);
8262 unsigned rd = INSTR (4, 0);
8263 /* TODO : check that this rounds toward zero. */
8264 double d = aarch64_get_FP_double (cpu, sn);
8265 int32_t value = (int32_t) d;
8266
8267 RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8268
8269 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8270 /* Avoid sign extension to 64 bit. */
8271 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8272 }
8273
8274 /* 64 bit convert double to signed int truncate towards zero. */
8275 static void
8276 fcvtszd (sim_cpu *cpu)
8277 {
8278 unsigned sn = INSTR (9, 5);
8279 unsigned rd = INSTR (4, 0);
8280 /* TODO : check that this rounds toward zero. */
8281 double d = aarch64_get_FP_double (cpu, sn);
8282 int64_t value;
8283
8284 value = (int64_t) d;
8285
8286 RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8287
8288 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8289 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8290 }
8291
8292 static void
8293 do_fcvtzu (sim_cpu *cpu)
8294 {
8295 /* instr[31] = size: 32-bit (0), 64-bit (1)
8296 instr[30,23] = 00111100
8297 instr[22] = type: single (0)/ double (1)
     instr[21] = 1 ==> integer, 0 ==> fixed-point (scaled by instr[15,10])
8299 instr[20,16] = 11001
8300 instr[15,10] = precision
8301 instr[9,5] = Rs
8302 instr[4,0] = Rd. */
8303
8304 unsigned rs = INSTR (9, 5);
8305 unsigned rd = INSTR (4, 0);
8306
8307 NYI_assert (30, 23, 0x3C);
8308 NYI_assert (20, 16, 0x19);
8309
8310 if (INSTR (21, 21) != 1)
8311 /* Convert to fixed point. */
8312 HALT_NYI;
8313
8314 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8315 if (INSTR (31, 31))
8316 {
8317 /* Convert to unsigned 64-bit integer. */
8318 if (INSTR (22, 22))
8319 {
8320 double d = aarch64_get_FP_double (cpu, rs);
8321 uint64_t value = (uint64_t) d;
8322
8323 /* Do not raise an exception if we have reached ULONG_MAX. */
8324 if (value != (1UL << 63))
8325 RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8326
8327 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8328 }
8329 else
8330 {
8331 float f = aarch64_get_FP_float (cpu, rs);
8332 uint64_t value = (uint64_t) f;
8333
8334 /* Do not raise an exception if we have reached ULONG_MAX. */
8335 if (value != (1UL << 63))
8336 RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8337
8338 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8339 }
8340 }
8341 else
8342 {
8343 uint32_t value;
8344
8345 /* Convert to unsigned 32-bit integer. */
8346 if (INSTR (22, 22))
8347 {
8348 double d = aarch64_get_FP_double (cpu, rs);
8349
8350 value = (uint32_t) d;
8351 /* Do not raise an exception if we have reached UINT_MAX. */
8352 if (value != (1UL << 31))
8353 RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8354 }
8355 else
8356 {
8357 float f = aarch64_get_FP_float (cpu, rs);
8358
8359 value = (uint32_t) f;
8360 /* Do not raise an exception if we have reached UINT_MAX. */
8361 if (value != (1UL << 31))
8362 RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8363 }
8364
8365 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8366 }
8367 }
8368
8369 static void
8370 do_UCVTF (sim_cpu *cpu)
8371 {
8372 /* instr[31] = size: 32-bit (0), 64-bit (1)
8373 instr[30,23] = 001 1110 0
8374 instr[22] = type: single (0)/ double (1)
     instr[21] = 1 ==> integer, 0 ==> fixed-point (scaled by instr[15,10])
8376 instr[20,16] = 0 0011
8377 instr[15,10] = precision
8378 instr[9,5] = Rs
8379 instr[4,0] = Rd. */
8380
8381 unsigned rs = INSTR (9, 5);
8382 unsigned rd = INSTR (4, 0);
8383
8384 NYI_assert (30, 23, 0x3C);
8385 NYI_assert (20, 16, 0x03);
8386
8387 if (INSTR (21, 21) != 1)
8388 HALT_NYI;
8389
8390 /* FIXME: Add exception raising. */
8391 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8392 if (INSTR (31, 31))
8393 {
8394 uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8395
8396 if (INSTR (22, 22))
8397 aarch64_set_FP_double (cpu, rd, (double) value);
8398 else
8399 aarch64_set_FP_float (cpu, rd, (float) value);
8400 }
8401 else
8402 {
8403 uint32_t value = aarch64_get_reg_u32 (cpu, rs, NO_SP);
8404
8405 if (INSTR (22, 22))
8406 aarch64_set_FP_double (cpu, rd, (double) value);
8407 else
8408 aarch64_set_FP_float (cpu, rd, (float) value);
8409 }
8410 }
8411
static void
float_vector_move (sim_cpu *cpu)
{
  /* instr[31,17] == 100 1111 0101 0111
     instr[16]    ==> direction 0=> to GR, 1=> from GR
     instr[15,10] => ???
     instr[9,5]   ==> source
     instr[4,0]   ==> dest.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (31, 17, 0x4F57);

  if (INSTR (15, 10) != 0)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (16, 16))
    aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
}

static void
dexSimpleFPIntegerConvert (sim_cpu *cpu)
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = 0
     instr[29]    = S : 0 ==> OK, 1 ==> UNALLOC
     instr[28,25] = 1111
     instr[24]    = 0
     instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
     instr[21]    = 1
     instr[20,19] = rmode
     instr[18,16] = opcode
     instr[15,10] = 10 0000.  */

  uint32_t rmode_opcode;
  uint32_t size_type;
  uint32_t type;
  uint32_t size;
  uint32_t S;

  if (INSTR (31, 17) == 0x4F57)
    {
      float_vector_move (cpu);
      return;
    }

  size = INSTR (31, 31);
  S = INSTR (29, 29);
  if (S != 0)
    HALT_UNALLOC;

  type = INSTR (23, 22);
  if (type > 1)
    HALT_UNALLOC;

  rmode_opcode = INSTR (20, 16);
  size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */

  switch (rmode_opcode)
    {
    case 2: /* SCVTF.  */
      switch (size_type)
        {
        case 0: scvtf32 (cpu); return;
        case 1: scvtd32 (cpu); return;
        case 2: scvtf (cpu); return;
        case 3: scvtd (cpu); return;
        }

    case 6: /* FMOV GR, Vec.  */
      switch (size_type)
        {
        case 0: gfmovs (cpu); return;
        case 3: gfmovd (cpu); return;
        default: HALT_UNALLOC;
        }

    case 7: /* FMOV vec, GR.  */
      switch (size_type)
        {
        case 0: fgmovs (cpu); return;
        case 3: fgmovd (cpu); return;
        default: HALT_UNALLOC;
        }

    case 24: /* FCVTZS.  */
      switch (size_type)
        {
        case 0: fcvtszs32 (cpu); return;
        case 1: fcvtszd32 (cpu); return;
        case 2: fcvtszs (cpu); return;
        case 3: fcvtszd (cpu); return;
        }

    case 25: do_fcvtzu (cpu); return;
    case 3: do_UCVTF (cpu); return;

    case 0: /* FCVTNS.  */
    case 1: /* FCVTNU.  */
    case 4: /* FCVTAS.  */
    case 5: /* FCVTAU.  */
    case 8: /* FCVTPS.  */
    case 9: /* FCVTPU.  */
    case 16: /* FCVTMS.  */
    case 17: /* FCVTMU.  */
    default:
      HALT_NYI;
    }
}

static void
set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
{
  uint32_t flags;

  /* FIXME: Add exception raising.  */
  if (isnan (fvalue1) || isnan (fvalue2))
    flags = C|V;
  else if (isinf (fvalue1) && isinf (fvalue2))
    {
      /* Subtracting two infinities may give a NaN.  We only need to compare
         the signs, which we can get from isinf.  */
      int result = isinf (fvalue1) - isinf (fvalue2);

      if (result == 0)
        flags = Z|C;
      else if (result < 0)
        flags = N;
      else /* (result > 0).  */
        flags = C;
    }
  else
    {
      float result = fvalue1 - fvalue2;

      if (result == 0.0)
        flags = Z|C;
      else if (result < 0)
        flags = N;
      else /* (result > 0).  */
        flags = C;
    }

  aarch64_set_CPSR (cpu, flags);
}
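
/* For reference, the NZCV encoding produced above follows the AArch64
   FCMP convention:
     equal      ==> Z|C
     less than  ==> N
     greater    ==> C
     unordered  ==> C|V
   Below is a minimal self-contained sketch of the same classification,
   useful when checking the simulator against hardware.  It is
   illustrative only and is not called from any decode path.  */
static const char *
fp_compare_class_example (double v1, double v2)
{
  if (isnan (v1) || isnan (v2))
    return "unordered (C|V)";
  if (v1 == v2)                 /* Also true for two like-signed infinities.  */
    return "equal (Z|C)";
  return v1 < v2 ? "less than (N)" : "greater (C)";
}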

static void
fcmps (sim_cpu *cpu)
{
  unsigned sm = INSTR (20, 16);
  unsigned sn = INSTR ( 9, 5);

  float fvalue1 = aarch64_get_FP_float (cpu, sn);
  float fvalue2 = aarch64_get_FP_float (cpu, sm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, fvalue1, fvalue2);
}

/* Float compare to zero -- Invalid Operation exception
   only on signaling NaNs.  */
static void
fcmpzs (sim_cpu *cpu)
{
  unsigned sn = INSTR ( 9, 5);
  float fvalue1 = aarch64_get_FP_float (cpu, sn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, fvalue1, 0.0f);
}

/* Float compare -- Invalid Operation exception on all NaNs.  */
static void
fcmpes (sim_cpu *cpu)
{
  unsigned sm = INSTR (20, 16);
  unsigned sn = INSTR ( 9, 5);

  float fvalue1 = aarch64_get_FP_float (cpu, sn);
  float fvalue2 = aarch64_get_FP_float (cpu, sm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, fvalue1, fvalue2);
}

/* Float compare to zero -- Invalid Operation exception on all NaNs.  */
static void
fcmpzes (sim_cpu *cpu)
{
  unsigned sn = INSTR ( 9, 5);
  float fvalue1 = aarch64_get_FP_float (cpu, sn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, fvalue1, 0.0f);
}

static void
set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
{
  uint32_t flags;

  /* FIXME: Add exception raising.  */
  if (isnan (dval1) || isnan (dval2))
    flags = C|V;
  else if (isinf (dval1) && isinf (dval2))
    {
      /* Subtracting two infinities may give a NaN.  We only need to compare
         the signs, which we can get from isinf.  */
      int result = isinf (dval1) - isinf (dval2);

      if (result == 0)
        flags = Z|C;
      else if (result < 0)
        flags = N;
      else /* (result > 0).  */
        flags = C;
    }
  else
    {
      double result = dval1 - dval2;

      if (result == 0.0)
        flags = Z|C;
      else if (result < 0)
        flags = N;
      else /* (result > 0).  */
        flags = C;
    }

  aarch64_set_CPSR (cpu, flags);
}

/* Double compare -- Invalid Operation exception only on signaling NaNs.  */
static void
fcmpd (sim_cpu *cpu)
{
  unsigned sm = INSTR (20, 16);
  unsigned sn = INSTR ( 9, 5);

  double dvalue1 = aarch64_get_FP_double (cpu, sn);
  double dvalue2 = aarch64_get_FP_double (cpu, sm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, dvalue1, dvalue2);
}

/* Double compare to zero -- Invalid Operation exception
   only on signaling NaNs.  */
static void
fcmpzd (sim_cpu *cpu)
{
  unsigned sn = INSTR ( 9, 5);
  double dvalue1 = aarch64_get_FP_double (cpu, sn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, dvalue1, 0.0);
}

/* Double compare -- Invalid Operation exception on all NaNs.  */
static void
fcmped (sim_cpu *cpu)
{
  unsigned sm = INSTR (20, 16);
  unsigned sn = INSTR ( 9, 5);

  double dvalue1 = aarch64_get_FP_double (cpu, sn);
  double dvalue2 = aarch64_get_FP_double (cpu, sm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, dvalue1, dvalue2);
}

/* Double compare to zero -- Invalid Operation exception on all NaNs.  */
static void
fcmpzed (sim_cpu *cpu)
{
  unsigned sn = INSTR ( 9, 5);
  double dvalue1 = aarch64_get_FP_double (cpu, sn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, dvalue1, 0.0);
}

static void
dexSimpleFPCompare (sim_cpu *cpu)
{
  /* assert instr[28,25] == 1111
     instr[30:24:21:13,10] = 0011000
     instr[31]    = M : 0 ==> OK, 1 ==> UNALLOC
     instr[29]    ==> S : 0 ==> OK, 1 ==> UNALLOC
     instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
     instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
     instr[4,0]   ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
                                01000 ==> FCMPZ, 11000 ==> FCMPEZ,
                                ow ==> UNALLOC  */
  uint32_t dispatch;
  uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
  uint32_t type = INSTR (23, 22);
  uint32_t op = INSTR (15, 14);
  uint32_t op2_2_0 = INSTR (2, 0);

  if (op2_2_0 != 0)
    HALT_UNALLOC;

  if (M_S != 0)
    HALT_UNALLOC;

  if (type > 1)
    HALT_UNALLOC;

  if (op != 0)
    HALT_UNALLOC;

  /* Dispatch on type and top 2 bits of opcode.  */
  dispatch = (type << 2) | INSTR (4, 3);

  switch (dispatch)
    {
    case 0: fcmps (cpu); return;
    case 1: fcmpzs (cpu); return;
    case 2: fcmpes (cpu); return;
    case 3: fcmpzes (cpu); return;
    case 4: fcmpd (cpu); return;
    case 5: fcmpzd (cpu); return;
    case 6: fcmped (cpu); return;
    case 7: fcmpzed (cpu); return;
    }
}

static void
do_scalar_FADDP (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1110 0
     instr [22]    = single(0)/double(1)
     instr [21,10] = 11 0000 1101 10
     instr [9,5]   = Fn
     instr [4,0]   = Fd.  */

  unsigned Fn = INSTR (9, 5);
  unsigned Fd = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FC);
  NYI_assert (21, 10, 0xC36);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double val1 = aarch64_get_vec_double (cpu, Fn, 0);
      double val2 = aarch64_get_vec_double (cpu, Fn, 1);

      aarch64_set_FP_double (cpu, Fd, val1 + val2);
    }
  else
    {
      float val1 = aarch64_get_vec_float (cpu, Fn, 0);
      float val2 = aarch64_get_vec_float (cpu, Fn, 1);

      aarch64_set_FP_float (cpu, Fd, val1 + val2);
    }
}

/* Floating point absolute difference.  */

static void
do_scalar_FABD (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1110 1
     instr [22]    = float(0)/double(1)
     instr [21]    = 1
     instr [20,16] = Rm
     instr [15,10] = 1101 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FD);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x35);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    aarch64_set_FP_double (cpu, rd,
                           fabs (aarch64_get_FP_double (cpu, rn)
                                 - aarch64_get_FP_double (cpu, rm)));
  else
    aarch64_set_FP_float (cpu, rd,
                          fabsf (aarch64_get_FP_float (cpu, rn)
                                 - aarch64_get_FP_float (cpu, rm)));
}

static void
do_scalar_CMGT (sim_cpu *cpu)
{
  /* instr [31,21] = 0101 1110 111
     instr [20,16] = Rm
     instr [15,10] = 00 1101
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (31, 21, 0x2F7);
  NYI_assert (15, 10, 0x0D);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0,
                       aarch64_get_vec_u64 (cpu, rn, 0) >
                       aarch64_get_vec_u64 (cpu, rm, 0) ? -1L : 0L);
}

static void
do_scalar_USHR (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1111 0
     instr [22,16] = shift amount
     instr [15,10] = 0000 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned amount = 128 - INSTR (22, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FE);
  NYI_assert (15, 10, 0x01);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0,
                       aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
}

static void
do_scalar_SSHL (sim_cpu *cpu)
{
  /* instr [31,21] = 0101 1110 111
     instr [20,16] = Rm
     instr [15,10] = 0100 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);

  NYI_assert (31, 21, 0x2F7);
  NYI_assert (15, 10, 0x11);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (shift >= 0)
    aarch64_set_vec_s64 (cpu, rd, 0,
                         aarch64_get_vec_s64 (cpu, rn, 0) << shift);
  else
    aarch64_set_vec_s64 (cpu, rd, 0,
                         aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
}

static void
do_scalar_shift (sim_cpu *cpu)
{
  /* instr [31,23] = 0101 1111 0
     instr [22,16] = shift amount
     instr [15,10] = 0101 01   [SHL]
     instr [15,10] = 0000 01   [SSHR]
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned amount;

  NYI_assert (31, 23, 0x0BE);

  if (INSTR (22, 22) == 0)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (15, 10))
    {
    case 0x01: /* SSHR */
      amount = 128 - INSTR (22, 16);
      aarch64_set_vec_s64 (cpu, rd, 0,
                           aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
      return;
    case 0x15: /* SHL */
      amount = INSTR (22, 16) - 64;
      aarch64_set_vec_u64 (cpu, rd, 0,
                           aarch64_get_vec_u64 (cpu, rn, 0) << amount);
      return;
    default:
      HALT_NYI;
    }
}
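
/* The shift immediates above come from instr[22,16] (immh:immb), with
   immh[3] set for these 64-bit forms (hence the INSTR (22, 22) check).
   A right shift amount is 128 - immh:immb and a left shift amount is
   immh:immb - 64; e.g. immh:immb = 120 (0x78) encodes SSHR #8 or
   SHL #56.  A standalone sketch of that decoding, illustrative only
   and not used by the decoder:  */
static unsigned
scalar_shift_amount_example (unsigned immhb, int is_left_shift)
{
  /* Caller guarantees immh[3] is set, i.e. 64 <= immhb <= 127.  */
  return is_left_shift ? immhb - 64 : 128 - immhb;
}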

/* FCMEQ FCMGT FCMGE.  */
static void
do_scalar_FCM (sim_cpu *cpu)
{
  /* instr [31,30] = 01
     instr [29]    = U
     instr [28,24] = 1 1110
     instr [23]    = E
     instr [22]    = size
     instr [21]    = 1
     instr [20,16] = Rm
     instr [15,12] = 1110
     instr [11]    = AC
     instr [10]    = 1
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
  unsigned result;
  float val1;
  float val2;

  NYI_assert (31, 30, 1);
  NYI_assert (28, 24, 0x1E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 12, 0xE);
  NYI_assert (10, 10, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double val1 = aarch64_get_FP_double (cpu, rn);
      double val2 = aarch64_get_FP_double (cpu, rm);

      switch (EUac)
        {
        case 0: /* 000 */
          result = val1 == val2;
          break;

        case 3: /* 011 */
          val1 = fabs (val1);
          val2 = fabs (val2);
          /* Fall through.  */
        case 2: /* 010 */
          result = val1 >= val2;
          break;

        case 7: /* 111 */
          val1 = fabs (val1);
          val2 = fabs (val2);
          /* Fall through.  */
        case 6: /* 110 */
          result = val1 > val2;
          break;

        default:
          HALT_UNALLOC;
        }

      aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
      return;
    }

  val1 = aarch64_get_FP_float (cpu, rn);
  val2 = aarch64_get_FP_float (cpu, rm);

  switch (EUac)
    {
    case 0: /* 000 */
      result = val1 == val2;
      break;

    case 3: /* 011 */
      val1 = fabsf (val1);
      val2 = fabsf (val2);
      /* Fall through.  */
    case 2: /* 010 */
      result = val1 >= val2;
      break;

    case 7: /* 111 */
      val1 = fabsf (val1);
      val2 = fabsf (val2);
      /* Fall through.  */
    case 6: /* 110 */
      result = val1 > val2;
      break;

    default:
      HALT_UNALLOC;
    }

  aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
}

/* An alias of DUP.  */
static void
do_scalar_MOV (sim_cpu *cpu)
{
  /* instr [31,21] = 0101 1110 000
     instr [20,16] = imm5
     instr [15,10] = 0000 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned index;

  NYI_assert (31, 21, 0x2F0);
  NYI_assert (15, 10, 0x01);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (16, 16))
    {
      /* 8-bit.  */
      index = INSTR (20, 17);
      aarch64_set_vec_u8
        (cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
    }
  else if (INSTR (17, 17))
    {
      /* 16-bit.  */
      index = INSTR (20, 18);
      aarch64_set_vec_u16
        (cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
    }
  else if (INSTR (18, 18))
    {
      /* 32-bit.  */
      index = INSTR (20, 19);
      aarch64_set_vec_u32
        (cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
    }
  else if (INSTR (19, 19))
    {
      /* 64-bit.  */
      index = INSTR (20, 20);
      aarch64_set_vec_u64
        (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
    }
  else
    HALT_UNALLOC;
}
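
/* The cascade above is the standard imm5 decode: the lowest set bit of
   imm5 (instr[20,16]) selects the element size and the bits above it
   give the element index, i.e. xxxx1 ==> byte with index imm5<4:1>,
   xxx10 ==> half with index imm5<4:2>, xx100 ==> word with index
   imm5<4:3>, x1000 ==> doubleword with index imm5<4>.  A standalone
   sketch of the same decode, illustrative only:  */
static int
dup_element_decode_example (unsigned imm5, unsigned *size, unsigned *index)
{
  unsigned shift;

  for (shift = 0; shift < 4; shift++)
    if (imm5 & (1u << shift))
      {
        *size = 8u << shift;           /* Element size in bits.  */
        *index = imm5 >> (shift + 1);  /* Element index.  */
        return 0;
      }
  return -1;  /* imm5<3:0> == 0000 is unallocated.  */
}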

static void
do_scalar_NEG (sim_cpu *cpu)
{
  /* instr [31,10] = 0111 1110 1110 0000 1011 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (31, 10, 0x1FB82E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
}

static void
do_scalar_USHL (sim_cpu *cpu)
{
  /* instr [31,21] = 0111 1110 111
     instr [20,16] = Rm
     instr [15,10] = 0100 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);

  NYI_assert (31, 21, 0x3F7);
  NYI_assert (15, 10, 0x11);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (shift >= 0)
    aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
  else
    aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
}
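
/* For USHL/SSHL the shift operand is the *signed* byte in the bottom
   of Vm: a positive value shifts left, a negative value shifts right
   by the absolute amount, so a shift byte of -2 turns USHL into a
   logical right shift by 2.  A standalone sketch of the unsigned
   semantics, illustrative only:  */
static uint64_t
ushl_example (uint64_t value, int8_t shift)
{
  return shift >= 0 ? value << shift : value >> - shift;
}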

static void
do_double_add (sim_cpu *cpu)
{
  /* instr [31,21] = 0101 1110 111
     instr [20,16] = Fn
     instr [15,10] = 1000 01
     instr [9,5]   = Fm
     instr [4,0]   = Fd.  */
  unsigned Fd;
  unsigned Fm;
  unsigned Fn;
  double val1;
  double val2;

  NYI_assert (31, 21, 0x2F7);
  NYI_assert (15, 10, 0x21);

  Fd = INSTR (4, 0);
  Fm = INSTR (9, 5);
  Fn = INSTR (20, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val1 = aarch64_get_FP_double (cpu, Fm);
  val2 = aarch64_get_FP_double (cpu, Fn);

  aarch64_set_FP_double (cpu, Fd, val1 + val2);
}

static void
do_scalar_UCVTF (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1110 0
     instr [22]    = single(0)/double(1)
     instr [21,10] = 10 0001 1101 10
     instr [9,5]   = rn
     instr [4,0]   = rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FC);
  NYI_assert (21, 10, 0x876);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);

      aarch64_set_vec_double (cpu, rd, 0, (double) val);
    }
  else
    {
      uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);

      aarch64_set_vec_float (cpu, rd, 0, (float) val);
    }
}

static void
do_scalar_vec (sim_cpu *cpu)
{
  /* instr [30] = 1.  */
  /* instr [28,25] = 1111.  */
  switch (INSTR (31, 23))
    {
    case 0xBC:
      switch (INSTR (15, 10))
        {
        case 0x01: do_scalar_MOV (cpu); return;
        case 0x39: do_scalar_FCM (cpu); return;
        case 0x3B: do_scalar_FCM (cpu); return;
        }
      break;

    case 0xBE: do_scalar_shift (cpu); return;

    case 0xFC:
      switch (INSTR (15, 10))
        {
        case 0x36:
          switch (INSTR (21, 16))
            {
            case 0x30: do_scalar_FADDP (cpu); return;
            case 0x21: do_scalar_UCVTF (cpu); return;
            }
          HALT_NYI;
        case 0x39: do_scalar_FCM (cpu); return;
        case 0x3B: do_scalar_FCM (cpu); return;
        }
      break;

    case 0xFD:
      switch (INSTR (15, 10))
        {
        case 0x0D: do_scalar_CMGT (cpu); return;
        case 0x11: do_scalar_USHL (cpu); return;
        case 0x2E: do_scalar_NEG (cpu); return;
        case 0x35: do_scalar_FABD (cpu); return;
        case 0x39: do_scalar_FCM (cpu); return;
        case 0x3B: do_scalar_FCM (cpu); return;
        default:
          HALT_NYI;
        }

    case 0xFE: do_scalar_USHR (cpu); return;

    case 0xBD:
      switch (INSTR (15, 10))
        {
        case 0x21: do_double_add (cpu); return;
        case 0x11: do_scalar_SSHL (cpu); return;
        default:
          HALT_NYI;
        }

    default:
      HALT_NYI;
    }
}

static void
dexAdvSIMD1 (sim_cpu *cpu)
{
  /* instr [28,25] = 1 111.  */

  /* We are currently only interested in the basic
     scalar fp routines which all have bit 30 = 0.  */
  if (INSTR (30, 30))
    do_scalar_vec (cpu);

  /* instr[24] is set for FP data processing 3-source and clear for
     all other basic scalar fp instruction groups.  */
  else if (INSTR (24, 24))
    dexSimpleFPDataProc3Source (cpu);

  /* instr[21] is clear for floating <-> fixed conversions and set for
     all other basic scalar fp instruction groups.  */
  else if (!INSTR (21, 21))
    dexSimpleFPFixedConvert (cpu);

  /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
     11 ==> cond select, 00 ==> other.  */
  else
    switch (INSTR (11, 10))
      {
      case 1: dexSimpleFPCondCompare (cpu); return;
      case 2: dexSimpleFPDataProc2Source (cpu); return;
      case 3: dexSimpleFPCondSelect (cpu); return;

      default:
        /* Now an ordered cascade of tests.
           FP immediate has instr [12] == 1.
           FP compare has instr [13] == 1.
           FP Data Proc 1 Source has instr [14] == 1.
           FP floating <--> integer conversions has instr [15] == 0.  */
        if (INSTR (12, 12))
          dexSimpleFPImmediate (cpu);

        else if (INSTR (13, 13))
          dexSimpleFPCompare (cpu);

        else if (INSTR (14, 14))
          dexSimpleFPDataProc1Source (cpu);

        else if (!INSTR (15, 15))
          dexSimpleFPIntegerConvert (cpu);

        else
          /* If we get here then instr[15] == 1 which means UNALLOC.  */
          HALT_UNALLOC;
      }
}

/* PC relative addressing.  */

static void
pcadr (sim_cpu *cpu)
{
  /* instr[31]    = op : 0 ==> ADR, 1 ==> ADRP
     instr[30,29] = immlo
     instr[23,5]  = immhi.  */
  uint64_t address;
  unsigned rd = INSTR (4, 0);
  uint32_t isPage = INSTR (31, 31);
  union { int64_t s64; uint64_t u64; } imm;
  uint64_t offset;

  imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
  offset = imm.u64;
  offset = (offset << 2) | INSTR (30, 29);

  address = aarch64_get_PC (cpu);

  if (isPage)
    {
      offset <<= 12;
      address &= ~0xfff;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
}
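
/* Worked example for pcadr: with PC = 0x400123 and a combined
   immhi:immlo of 1, ADR computes 0x400123 + 1 = 0x400124, while ADRP
   computes (0x400123 & ~0xfff) + (1 << 12) = 0x401000.  A standalone
   sketch of the page-relative case, illustrative only (page_offset is
   the sign-extended immhi:immlo value):  */
static uint64_t
adrp_target_example (uint64_t pc, int64_t page_offset)
{
  return (pc & ~(uint64_t) 0xfff) + ((uint64_t) page_offset << 12);
}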

/* Specific decode and execute for group Data Processing Immediate.  */

static void
dexPCRelAddressing (sim_cpu *cpu)
{
  /* assert instr[28,24] = 10000.  */
  pcadr (cpu);
}

/* Immediate logical.
   The bimm32/64 argument is constructed by replicating a 2, 4, 8,
   16, 32 or 64 bit sequence pulled out at decode and possibly
   inverting it.

   N.B. the output register (dest) can normally be Xn or SP;
   the exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  The input register can
   never be SP.  */

/* 32 bit and immediate.  */
static void
and32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
}

/* 64 bit and immediate.  */
static void
and64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
}

/* 32 bit and immediate set flags.  */
static void
ands32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t value2 = bimm;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
  set_flags_for_binop32 (cpu, value1 & value2);
}

/* 64 bit and immediate set flags.  */
static void
ands64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t value2 = bimm;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
  set_flags_for_binop64 (cpu, value1 & value2);
}

/* 32 bit exclusive or immediate.  */
static void
eor32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
}

/* 64 bit exclusive or immediate.  */
static void
eor64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
}

/* 32 bit or immediate.  */
static void
orr32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
}

/* 64 bit or immediate.  */
static void
orr64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
}

/* Logical shifted register.
   These allow an optional LSL, ASR, LSR or ROR to the second source
   register with a count up to the register bit count.
   N.B register args may not be SP.  */

/* 32 bit AND shifted register.  */
static void
and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
     & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
}

/* 64 bit AND shifted register.  */
static void
and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
     & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
}

/* 32 bit AND shifted register setting flags.  */
static void
ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                               shift, count);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
  set_flags_for_binop32 (cpu, value1 & value2);
}

/* 64 bit AND shifted register setting flags.  */
static void
ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
                               shift, count);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
  set_flags_for_binop64 (cpu, value1 & value2);
}

/* 32 bit BIC shifted register.  */
static void
bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
     & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
}

/* 64 bit BIC shifted register.  */
static void
bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
     & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
}

/* 32 bit BIC shifted register setting flags.  */
static void
bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                 shift, count);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
  set_flags_for_binop32 (cpu, value1 & value2);
}

/* 64 bit BIC shifted register setting flags.  */
static void
bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
                                 shift, count);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
  set_flags_for_binop64 (cpu, value1 & value2);
}

/* 32 bit EON shifted register.  */
static void
eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
     ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
}

/* 64 bit EON shifted register.  */
static void
eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
     ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
}

/* 32 bit EOR shifted register.  */
static void
eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
     ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
}

/* 64 bit EOR shifted register.  */
static void
eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
     ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
}

/* 32 bit ORR shifted register.  */
static void
orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
     | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
}

/* 64 bit ORR shifted register.  */
static void
orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
     | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
}

/* 32 bit ORN shifted register.  */
static void
orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
     | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
}

/* 64 bit ORN shifted register.  */
static void
orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
     | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
}

static void
dexLogicalImmediate (sim_cpu *cpu)
{
  /* assert instr[28,23] = 100100
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
     instr[22]    = N : used to construct immediate mask
     instr[21,16] = immr
     instr[15,10] = imms
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
  uint32_t size = INSTR (31, 31);
  uint32_t N = INSTR (22, 22);
  /* uint32_t immr = INSTR (21, 16);  */
  /* uint32_t imms = INSTR (15, 10);  */
  uint32_t index = INSTR (22, 10);
  uint64_t bimm64 = LITable [index];
  uint32_t dispatch = INSTR (30, 29);

  if (~size & N)
    HALT_UNALLOC;

  if (!bimm64)
    HALT_UNALLOC;

  if (size == 0)
    {
      uint32_t bimm = (uint32_t) bimm64;

      switch (dispatch)
        {
        case 0: and32 (cpu, bimm); return;
        case 1: orr32 (cpu, bimm); return;
        case 2: eor32 (cpu, bimm); return;
        case 3: ands32 (cpu, bimm); return;
        }
    }
  else
    {
      switch (dispatch)
        {
        case 0: and64 (cpu, bimm64); return;
        case 1: orr64 (cpu, bimm64); return;
        case 2: eor64 (cpu, bimm64); return;
        case 3: ands64 (cpu, bimm64); return;
        }
    }
  HALT_UNALLOC;
}
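
/* The LITable lookup above is keyed on the 13 bits N:immr:imms taken
   straight from the instruction; combinations that do not decode to a
   valid bitmask immediate are presumed to hold zero, which is why a
   zero bimm64 is rejected as UNALLOC.  A standalone sketch of the
   index computation, illustrative only:  */
static uint32_t
logical_imm_index_example (uint32_t N, uint32_t immr, uint32_t imms)
{
  /* Equivalent to INSTR (22, 10): N is instr[22], immr is
     instr[21,16], imms is instr[15,10].  */
  return (N << 12) | (immr << 6) | imms;
}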

/* Immediate move.
   The uimm argument is a 16 bit value to be inserted into the
   target register; the pos argument locates the 16 bit word in the
   dest register i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
   3} for 64 bit.
   N.B. the register arg may not be SP, so it should be
   accessed using the setGZRegisterXXX accessors.  */

/* 32 bit move 16 bit immediate zero remaining shorts.  */
static void
movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
}

/* 64 bit move 16 bit immediate zero remaining shorts.  */
static void
movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
}

/* 32 bit move 16 bit immediate negated.  */
static void
movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
}

/* 64 bit move 16 bit immediate negated.  */
static void
movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
                      ^ 0xffffffffffffffffULL));
}

/* 32 bit move 16 bit immediate keep remaining shorts.  */
static void
movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned rd = INSTR (4, 0);
  uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
  uint32_t value = val << (pos * 16);
  uint32_t mask = ~(0xffffU << (pos * 16));

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
}

/* 64 bit move 16 bit immediate keep remaining shorts.  */
static void
movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned rd = INSTR (4, 0);
  uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
  uint64_t value = (uint64_t) val << (pos * 16);
  uint64_t mask = ~(0xffffULL << (pos * 16));

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
}

static void
dexMoveWideImmediate (sim_cpu *cpu)
{
  /* assert instr[28:23] = 100101
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
     instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
     instr[20,5]  = uimm16
     instr[4,0]   = Rd  */

  /* N.B. the (multiple of 16) shift is applied by the called routine,
     we just pass the multiplier.  */

  uint32_t imm;
  uint32_t size = INSTR (31, 31);
  uint32_t op = INSTR (30, 29);
  uint32_t shift = INSTR (22, 21);

  /* 32 bit can only shift 0 or 1 lot of 16.
     Anything else is an unallocated instruction.  */
  if (size == 0 && (shift > 1))
    HALT_UNALLOC;

  if (op == 1)
    HALT_UNALLOC;

  imm = INSTR (20, 5);

  if (size == 0)
    {
      if (op == 0)
        movn32 (cpu, imm, shift);
      else if (op == 2)
        movz32 (cpu, imm, shift);
      else
        movk32 (cpu, imm, shift);
    }
  else
    {
      if (op == 0)
        movn64 (cpu, imm, shift);
      else if (op == 2)
        movz64 (cpu, imm, shift);
      else
        movk64 (cpu, imm, shift);
    }
}
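
/* A 64-bit constant is typically materialised as one MOVZ followed by
   up to three MOVKs, each depositing 16 bits at a different LSL
   position.  A standalone sketch using the same mask-and-insert
   semantics as movz64/movk64 above, illustrative only:  */
static uint64_t
build_constant_example (void)
{
  uint64_t x;

  x = (uint64_t) 0xf00d;                             /* MOVZ x, #0xf00d           */
  x = (x & ~(0xffffULL << 16)) | (0xcafeULL << 16);  /* MOVK x, #0xcafe, LSL #16  */
  x = (x & ~(0xffffULL << 32)) | (0xbeefULL << 32);  /* MOVK x, #0xbeef, LSL #32  */
  x = (x & ~(0xffffULL << 48)) | (0xdeadULL << 48);  /* MOVK x, #0xdead, LSL #48  */
  return x;                                          /* 0xdeadbeefcafef00d        */
}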

/* Bitfield operations.
   These take a pair of bit positions r and s which are in {0..31}
   or {0..63} depending on the instruction word size.
   N.B register args may not be SP.  */

/* OK, we start with ubfm which just needs to pick
   some bits out of source zero the rest and write
   the result to dest.  Just need two logical shifts.  */

/* 32 bit bitfield move, left and right of affected zeroed
   if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
static void
ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);

  /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
  if (r <= s)
    {
      /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
         We want only bits s:xxx:r at the bottom of the word
         so we LSL bit s up to bit 31 i.e. by 31 - s
         and then we LSR to bring bit 31 down to bit s - r
         i.e. by 31 + r - s.  */
      value <<= 31 - s;
      value >>= 31 + r - s;
    }
  else
    {
      /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
         We want only bits s:xxx:0 starting at bit 31-(r-1)
         so we LSL bit s up to bit 31 i.e. by 31 - s
         and then we LSR to bring bit 31 down to 31-(r-1)+s
         i.e. by r - (s + 1).  */
      value <<= 31 - s;
      value >>= r - (s + 1);
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
}
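
/* Worked example for ubfm32: with r = 8, s = 15 (the UBFX Wd, Wn, #8,
   #8 alias) and a source of 0xAABBCCDD the two shifts give
   0xAABBCCDD << 16 = 0xCCDD0000, then >> 24 = 0x000000CC, i.e. bits
   15:8 moved to the bottom of the word.  Standalone version of the
   same shifts, illustrative only:  */
static uint32_t
ubfm32_example (uint32_t value, uint32_t r, uint32_t s)
{
  if (r <= s)
    return (value << (31 - s)) >> (31 + r - s);
  return (value << (31 - s)) >> (r - (s + 1));
}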

/* 64 bit bitfield move, left and right of affected zeroed
   if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
static void
ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);

  if (r <= s)
    {
      /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
         We want only bits s:xxx:r at the bottom of the word.
         So we LSL bit s up to bit 63 i.e. by 63 - s
         and then we LSR to bring bit 63 down to bit s - r
         i.e. by 63 + r - s.  */
      value <<= 63 - s;
      value >>= 63 + r - s;
    }
  else
    {
      /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
         We want only bits s:xxx:0 starting at bit 63-(r-1).
         So we LSL bit s up to bit 63 i.e. by 63 - s
         and then we LSR to bring bit 63 down to 63-(r-1)+s
         i.e. by r - (s + 1).  */
      value <<= 63 - s;
      value >>= r - (s + 1);
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
}

/* The signed versions need to insert sign bits
   on the left of the inserted bit field.  So we do
   much the same as the unsigned version except we
   use an arithmetic shift right -- this just means
   we need to operate on signed values.  */
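
/* N.B. this relies on >> of a negative signed value being an
   arithmetic shift, which ISO C leaves implementation-defined but
   which holds for the compilers GDB is built with in practice.  For
   example, extracting bits 7:0 of 0x80 as a signed field (the SBFX
   Wd, Wn, #0, #8 alias, r = 0, s = 7) yields 0xffffff80.  A
   standalone sketch of the r <= s case, mirroring the shifts used
   below and illustrative only:  */
static int32_t
sbfm32_example (int32_t value, uint32_t r, uint32_t s)
{
  return (int32_t) (value << (31 - s)) >> (31 + r - s);
}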

/* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
/* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
static void
sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  /* As per ubfm32 but use an ASR instead of an LSR.  */
  int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);

  if (r <= s)
    {
      value <<= 31 - s;
      value >>= 31 + r - s;
    }
  else
    {
      value <<= 31 - s;
      value >>= r - (s + 1);
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
}

/* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
/* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
static void
sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  /* As per ubfm but use an ASR instead of an LSR.  */
  int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);

  if (r <= s)
    {
      value <<= 63 - s;
      value >>= 63 + r - s;
    }
  else
    {
      value <<= 63 - s;
      value >>= r - (s + 1);
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
}

/* Finally, these versions leave non-affected bits
   as is.  So we need to generate the bits as per
   ubfm and also generate a mask to pick the
   bits from the original and computed values.  */

/* 32 bit bitfield move, non-affected bits left as is.
   If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
static void
bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rn = INSTR (9, 5);
  uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t mask = -1;
  unsigned rd;

  /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
  if (r <= s)
    {
      /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
         We want only bits s:xxx:r at the bottom of the word
         so we LSL bit s up to bit 31 i.e. by 31 - s
         and then we LSR to bring bit 31 down to bit s - r
         i.e. by 31 + r - s.  */
      value <<= 31 - s;
      value >>= 31 + r - s;
      /* The mask must include the same bits.  */
      mask <<= 31 - s;
      mask >>= 31 + r - s;
    }
  else
    {
      /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
         We want only bits s:xxx:0 starting at bit 31-(r-1)
         so we LSL bit s up to bit 31 i.e. by 31 - s
         and then we LSR to bring bit 31 down to 31-(r-1)+s
         i.e. by r - (s + 1).  */
      value <<= 31 - s;
      value >>= r - (s + 1);
      /* The mask must include the same bits.  */
      mask <<= 31 - s;
      mask >>= r - (s + 1);
    }

  rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, (aarch64_get_reg_u32 (cpu, rd, NO_SP) & ~mask) | value);
}

/* 64 bit bitfield move, non-affected bits left as is.
   If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
static void
bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t mask = 0xffffffffffffffffULL;

  if (r <= s)
    {
      /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
         We want only bits s:xxx:r at the bottom of the word
         so we LSL bit s up to bit 63 i.e. by 63 - s
         and then we LSR to bring bit 63 down to bit s - r
         i.e. by 63 + r - s.  */
      value <<= 63 - s;
      value >>= 63 + r - s;
      /* The mask must include the same bits.  */
      mask <<= 63 - s;
      mask >>= 63 + r - s;
    }
  else
    {
      /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
         We want only bits s:xxx:0 starting at bit 63-(r-1)
         so we LSL bit s up to bit 63 i.e. by 63 - s
         and then we LSR to bring bit 63 down to 63-(r-1)+s
         i.e. by r - (s + 1).  */
      value <<= 63 - s;
      value >>= r - (s + 1);
      /* The mask must include the same bits.  */
      mask <<= 63 - s;
      mask >>= r - (s + 1);
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
}

static void
dexBitfieldImmediate (sim_cpu *cpu)
{
  /* assert instr[28:23] = 100110
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
     instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
     instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
     instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
  uint32_t dispatch;
  uint32_t imms;
  uint32_t size = INSTR (31, 31);
  uint32_t N = INSTR (22, 22);
  /* 32 bit operations must have immr[5] = 0 and imms[5] = 0
     or else we have an UNALLOC.  */
  uint32_t immr = INSTR (21, 16);

  if (~size & N)
    HALT_UNALLOC;

  if (!size && uimm (immr, 5, 5))
    HALT_UNALLOC;

  imms = INSTR (15, 10);
  if (!size && uimm (imms, 5, 5))
    HALT_UNALLOC;

  /* Switch on combined size and op.  */
  dispatch = INSTR (31, 29);
  switch (dispatch)
    {
    case 0: sbfm32 (cpu, immr, imms); return;
    case 1: bfm32 (cpu, immr, imms); return;
    case 2: ubfm32 (cpu, immr, imms); return;
    case 4: sbfm (cpu, immr, imms); return;
    case 5: bfm (cpu, immr, imms); return;
    case 6: ubfm (cpu, immr, imms); return;
    default: HALT_UNALLOC;
    }
}

static void
do_EXTR_32 (sim_cpu *cpu)
{
  /* instr[31:21] = 00010011100
     instr[20,16] = Rm
     instr[15,10] = imms : 0xxxxx for 32 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */
  unsigned rm   = INSTR (20, 16);
  unsigned imms = INSTR (15, 10) & 31;
  unsigned rn   = INSTR ( 9,  5);
  unsigned rd   = INSTR ( 4,  0);
  uint64_t val1;
  uint64_t val2;

  val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  val1 >>= imms;
  val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  val2 <<= (32 - imms);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, val1 | val2);
}

static void
do_EXTR_64 (sim_cpu *cpu)
{
  /* instr[31:21] = 10010011100
     instr[20,16] = Rm
     instr[15,10] = imms
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */
  unsigned rm   = INSTR (20, 16);
  unsigned imms = INSTR (15, 10) & 63;
  unsigned rn   = INSTR ( 9,  5);
  unsigned rd   = INSTR ( 4,  0);
  uint64_t val;

  val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  val >>= imms;
  val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));

  aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
}
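
/* When Rn == Rm, EXTR is the ROR (immediate) alias: concatenating a
   register with itself and extracting from bit imms is a rotate right
   by imms.  A standalone sketch for the 64-bit case, illustrative
   only (the early return for lsb == 0 avoids the undefined shift by
   64 that a naive one-liner would perform):  */
static uint64_t
ror64_example (uint64_t val, unsigned lsb)
{
  if ((lsb &= 63) == 0)
    return val;
  return (val >> lsb) | (val << (64 - lsb));
}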

static void
dexExtractImmediate (sim_cpu *cpu)
{
  /* assert instr[28:23] = 100111
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
     instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
     instr[21]    = op0 : must be 0 or UNALLOC
     instr[20,16] = Rm
     instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
  /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
  uint32_t dispatch;
  uint32_t size = INSTR (31, 31);
  uint32_t N = INSTR (22, 22);
  /* 32 bit operations must have imms[5] = 0
     or else we have an UNALLOC.  */
  uint32_t imms = INSTR (15, 10);

  if (size ^ N)
    HALT_UNALLOC;

  if (!size && uimm (imms, 5, 5))
    HALT_UNALLOC;

  /* Switch on combined size and op.  */
  dispatch = INSTR (31, 29);

  if (dispatch == 0)
    do_EXTR_32 (cpu);

  else if (dispatch == 4)
    do_EXTR_64 (cpu);

  else if (dispatch == 1)
    HALT_NYI;
  else
    HALT_UNALLOC;
}

static void
dexDPImm (sim_cpu *cpu)
{
  /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
     assert group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
     bits [25,23] of a DPImm are the secondary dispatch vector.  */
  uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));

  switch (group2)
    {
    case DPIMM_PCADR_000:
    case DPIMM_PCADR_001:
      dexPCRelAddressing (cpu);
      return;

    case DPIMM_ADDSUB_010:
    case DPIMM_ADDSUB_011:
      dexAddSubtractImmediate (cpu);
      return;

    case DPIMM_LOG_100:
      dexLogicalImmediate (cpu);
      return;

    case DPIMM_MOV_101:
      dexMoveWideImmediate (cpu);
      return;

    case DPIMM_BITF_110:
      dexBitfieldImmediate (cpu);
      return;

    case DPIMM_EXTR_111:
      dexExtractImmediate (cpu);
      return;

    default:
      /* Should never reach here.  */
      HALT_NYI;
    }
}

static void
dexLoadUnscaledImmediate (sim_cpu *cpu)
{
  /* instr[29,24] == 111_00
     instr[21]    == 0
     instr[11,10] == 00
     instr[31,30] = size
     instr[26]    = V
     instr[23,22] = opc
     instr[20,12] = simm9
     instr[9,5]   = rn may be SP.  */
  /* unsigned rt = INSTR (4, 0);  */
  uint32_t V = INSTR (26, 26);
  uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
  int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);

  if (!V)
    {
      /* GReg operations.  */
      switch (dispatch)
        {
        case 0:  sturb (cpu, imm); return;
        case 1:  ldurb32 (cpu, imm); return;
        case 2:  ldursb64 (cpu, imm); return;
        case 3:  ldursb32 (cpu, imm); return;
        case 4:  sturh (cpu, imm); return;
        case 5:  ldurh32 (cpu, imm); return;
        case 6:  ldursh64 (cpu, imm); return;
        case 7:  ldursh32 (cpu, imm); return;
        case 8:  stur32 (cpu, imm); return;
        case 9:  ldur32 (cpu, imm); return;
        case 10: ldursw (cpu, imm); return;
        case 12: stur64 (cpu, imm); return;
        case 13: ldur64 (cpu, imm); return;

        case 14:
          /* PRFUM NYI.  */
          HALT_NYI;

        default:
        case 11:
        case 15:
          HALT_UNALLOC;
        }
    }

  /* FReg operations.  */
  switch (dispatch)
    {
    case 2:  fsturq (cpu, imm); return;
    case 3:  fldurq (cpu, imm); return;
    case 8:  fsturs (cpu, imm); return;
    case 9:  fldurs (cpu, imm); return;
    case 12: fsturd (cpu, imm); return;
    case 13: fldurd (cpu, imm); return;

    case 0: /* STUR 8 bit FP.  */
    case 1: /* LDUR 8 bit FP.  */
    case 4: /* STUR 16 bit FP.  */
    case 5: /* LDUR 16 bit FP.  */
      HALT_NYI;

    default:
    case 6:
    case 7:
    case 10:
    case 11:
    case 14:
    case 15:
      HALT_UNALLOC;
    }
}

/* N.B. A preliminary note regarding all the ldrs<x>32
   instructions

   The signed value loaded by these instructions is cast to unsigned
   before being assigned to aarch64_get_reg_u64 (cpu, N) i.e. to the
   64 bit element of the GReg union.  This performs a 32 bit sign
   extension (as required) but avoids 64 bit sign extension, thus
   ensuring that the top half of the register word is zero.  This is
   what the spec demands when a 32 bit load occurs.  */
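
/* For example, loading the byte 0x80 into a W register this way gives
   0x00000000ffffff80 in the underlying 64-bit register: the byte is
   sign-extended to 32 bits and the unsigned cast then keeps the top
   half zero.  A standalone sketch of that double conversion,
   illustrative only:  */
static uint64_t
ldrsb32_extend_example (int8_t byte)
{
  /* int8_t -> int32_t sign-extends; the uint32_t cast keeps the top
     32 bits of the 64-bit result zero.  */
  return (uint32_t) (int32_t) byte;
}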

/* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
static void
ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned int rn = INSTR (9, 5);
  unsigned int rt = INSTR (4, 0);

  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       (int64_t) aarch64_get_mem_s8 (cpu, address));
}

/* 32 bit load sign-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned int rm = INSTR (20, 16);
  unsigned int rn = INSTR (9, 5);
  unsigned int rt = INSTR (4, 0);

  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                 extension);

  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_u64
    (cpu, rt, NO_SP, (int64_t) aarch64_get_mem_s8 (cpu, address
                                                   + displacement));
}

/* 32 bit load sign-extended byte unscaled signed 9 bit with
   pre- or post-writeback.  */
static void
ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  uint64_t address;
  unsigned int rn = INSTR (9, 5);
  unsigned int rt = INSTR (4, 0);

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb == Pre)
    address += offset;

  aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       (int64_t) aarch64_get_mem_s8 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
}
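
/* Pre-indexing adds the offset before the access and writes the new
   base back; post-indexing accesses at the old base and writes the
   updated base back afterwards.  E.g. base = 0x1000, offset = -16
   accesses 0xff0 when pre-indexed but 0x1000 when post-indexed, and
   both leave the base register holding 0xff0.  A standalone sketch of
   the addressing used above, illustrative only:  */
static uint64_t
writeback_ea_example (uint64_t *base, int32_t offset, WriteBack wb)
{
  uint64_t address = *base;

  if (wb == Pre)
    address += offset;
  if (wb != NoWriteBack)
    *base = address + (wb == Post ? offset : 0);
  return address;  /* The address actually accessed.  */
}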
10400
10401 /* 8 bit store scaled. */
10402 static void
10403 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10404 {
10405 unsigned st = INSTR (4, 0);
10406 unsigned rn = INSTR (9, 5);
10407
10408 aarch64_set_mem_u8 (cpu,
10409 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10410 aarch64_get_vec_u8 (cpu, st, 0));
10411 }
10412
10413 /* 8 bit store scaled or unscaled zero- or
10414 sign-extended 8-bit register offset. */
10415 static void
10416 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10417 {
10418 unsigned rm = INSTR (20, 16);
10419 unsigned rn = INSTR (9, 5);
10420 unsigned st = INSTR (4, 0);
10421
10422 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10423 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10424 extension);
10425 uint64_t displacement = extended; /* There is no scaling for a byte store. */
10426
10427 aarch64_set_mem_u8
10428 (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10429 }
10430
10431 /* 16 bit store scaled. */
10432 static void
10433 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10434 {
10435 unsigned st = INSTR (4, 0);
10436 unsigned rn = INSTR (9, 5);
10437
10438 aarch64_set_mem_u16
10439 (cpu,
10440 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10441 aarch64_get_vec_u16 (cpu, st, 0));
10442 }
10443
10444 /* 16 bit store scaled or unscaled zero-
10445 or sign-extended 16-bit register offset. */
10446 static void
10447 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10448 {
10449 unsigned rm = INSTR (20, 16);
10450 unsigned rn = INSTR (9, 5);
10451 unsigned st = INSTR (4, 0);
10452
10453 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10454 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10455 extension);
10456 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
10457
10458 aarch64_set_mem_u16
10459 (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10460 }
10461
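/* A sketch of the OPT_SCALE contract assumed above: the extended register
   offset is shifted by log2 of the access size in bytes, but only when the
   S bit selects a scaled offset. 'opt_scale_sketch' is a hypothetical name. */
#if 0
static int64_t
opt_scale_sketch (int64_t extended, unsigned bits, Scaling scaling)
{
  unsigned shift = (bits == 16 ? 1
                    : bits == 32 ? 2
                    : bits == 64 ? 3
                    : bits == 128 ? 4 : 0);
  return scaling == Scaled ? extended << shift : extended;
}
#endif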
10462 /* 32 bit store scaled unsigned 12 bit. */
10463 static void
10464 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10465 {
10466 unsigned st = INSTR (4, 0);
10467 unsigned rn = INSTR (9, 5);
10468
10469 aarch64_set_mem_u32
10470 (cpu,
10471 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10472 aarch64_get_vec_u32 (cpu, st, 0));
10473 }
10474
10475 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
10476 static void
10477 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10478 {
10479 unsigned rn = INSTR (9, 5);
10480 unsigned st = INSTR (4, 0);
10481
10482 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10483
10484 if (wb != Post)
10485 address += offset;
10486
10487 aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10488
10489 if (wb == Post)
10490 address += offset;
10491
10492 if (wb != NoWriteBack)
10493 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10494 }
10495
10496 /* 32 bit store scaled or unscaled zero-
10497 or sign-extended 32-bit register offset. */
10498 static void
10499 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10500 {
10501 unsigned rm = INSTR (20, 16);
10502 unsigned rn = INSTR (9, 5);
10503 unsigned st = INSTR (4, 0);
10504
10505 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10506 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10507 extension);
10508 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
10509
10510 aarch64_set_mem_u32
10511 (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10512 }
10513
10514 /* 64 bit store scaled unsigned 12 bit. */
10515 static void
10516 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10517 {
10518 unsigned st = INSTR (4, 0);
10519 unsigned rn = INSTR (9, 5);
10520
10521 aarch64_set_mem_u64
10522 (cpu,
10523 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10524 aarch64_get_vec_u64 (cpu, st, 0));
10525 }
10526
10527 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
10528 static void
10529 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10530 {
10531 unsigned rn = INSTR (9, 5);
10532 unsigned st = INSTR (4, 0);
10533
10534 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10535
10536 if (wb != Post)
10537 address += offset;
10538
10539 aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10540
10541 if (wb == Post)
10542 address += offset;
10543
10544 if (wb != NoWriteBack)
10545 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10546 }
10547
10548 /* 64 bit store scaled or unscaled zero-
10549 or sign-extended 32-bit register offset. */
10550 static void
10551 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10552 {
10553 unsigned rm = INSTR (20, 16);
10554 unsigned rn = INSTR (9, 5);
10555 unsigned st = INSTR (4, 0);
10556
10557 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10558 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10559 extension);
10560 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
10561
10562 aarch64_set_mem_u64
10563 (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10564 }
10565
10566 /* 128 bit store scaled unsigned 12 bit. */
10567 static void
10568 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10569 {
10570 FRegister a;
10571 unsigned st = INSTR (4, 0);
10572 unsigned rn = INSTR (9, 5);
10573 uint64_t addr;
10574
10575 aarch64_get_FP_long_double (cpu, st, & a);
10576
10577 addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10578 aarch64_set_mem_long_double (cpu, addr, a);
10579 }
10580
10581 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback. */
10582 static void
10583 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10584 {
10585 FRegister a;
10586 unsigned rn = INSTR (9, 5);
10587 unsigned st = INSTR (4, 0);
10588 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10589
10590 if (wb != Post)
10591 address += offset;
10592
10593 aarch64_get_FP_long_double (cpu, st, & a);
10594 aarch64_set_mem_long_double (cpu, address, a);
10595
10596 if (wb == Post)
10597 address += offset;
10598
10599 if (wb != NoWriteBack)
10600 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10601 }
10602
10603 /* 128 bit store scaled or unscaled zero-
10604 or sign-extended 32-bit register offset. */
10605 static void
10606 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10607 {
10608 unsigned rm = INSTR (20, 16);
10609 unsigned rn = INSTR (9, 5);
10610 unsigned st = INSTR (4, 0);
10611
10612 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10613 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10614 extension);
10615 uint64_t displacement = OPT_SCALE (extended, 128, scaling);
10616
10617 FRegister a;
10618
10619 aarch64_get_FP_long_double (cpu, st, & a);
10620 aarch64_set_mem_long_double (cpu, address + displacement, a);
10621 }
10622
10623 static void
10624 dexLoadImmediatePrePost (sim_cpu *cpu)
10625 {
10626 /* instr[31,30] = size
10627 instr[29,27] = 111
10628 instr[26] = V
10629 instr[25,24] = 00
10630 instr[23,22] = opc
10631 instr[21] = 0
10632 instr[20,12] = simm9
10633 instr[11] = wb : 0 ==> Post, 1 ==> Pre
10634 instr[10] = 0
10635 instr[9,5] = Rn may be SP.
10636 instr[4,0] = Rt */
10637
10638 uint32_t V = INSTR (26, 26);
10639 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10640 int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10641 WriteBack wb = INSTR (11, 11);
10642
10643 if (!V)
10644 {
10645 /* GReg operations. */
10646 switch (dispatch)
10647 {
10648 case 0: strb_wb (cpu, imm, wb); return;
10649 case 1: ldrb32_wb (cpu, imm, wb); return;
10650 case 2: ldrsb_wb (cpu, imm, wb); return;
10651 case 3: ldrsb32_wb (cpu, imm, wb); return;
10652 case 4: strh_wb (cpu, imm, wb); return;
10653 case 5: ldrh32_wb (cpu, imm, wb); return;
10654 case 6: ldrsh64_wb (cpu, imm, wb); return;
10655 case 7: ldrsh32_wb (cpu, imm, wb); return;
10656 case 8: str32_wb (cpu, imm, wb); return;
10657 case 9: ldr32_wb (cpu, imm, wb); return;
10658 case 10: ldrsw_wb (cpu, imm, wb); return;
10659 case 12: str_wb (cpu, imm, wb); return;
10660 case 13: ldr_wb (cpu, imm, wb); return;
10661
10662 default:
10663 case 11:
10664 case 14:
10665 case 15:
10666 HALT_UNALLOC;
10667 }
10668 }
10669
10670 /* FReg operations. */
10671 switch (dispatch)
10672 {
10673 case 2: fstrq_wb (cpu, imm, wb); return;
10674 case 3: fldrq_wb (cpu, imm, wb); return;
10675 case 8: fstrs_wb (cpu, imm, wb); return;
10676 case 9: fldrs_wb (cpu, imm, wb); return;
10677 case 12: fstrd_wb (cpu, imm, wb); return;
10678 case 13: fldrd_wb (cpu, imm, wb); return;
10679
10680 case 0: /* STR 8 bit FP. */
10681 case 1: /* LDR 8 bit FP. */
10682 case 4: /* STR 16 bit FP. */
10683 case 5: /* LDR 16 bit FP. */
10684 HALT_NYI;
10685
10686 default:
10687 case 6:
10688 case 7:
10689 case 10:
10690 case 11:
10691 case 14:
10692 case 15:
10693 HALT_UNALLOC;
10694 }
10695 }
10696
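/* A worked example (standalone sketch): decoding
   LDR X1, [X2, #8]! == 0xf8408c41 with the same field extraction used by
   dexLoadImmediatePrePost above. 'bits' is a local stand-in for the
   INSTR macro. */
#if 0
#include <stdint.h>
#include <assert.h>

static uint32_t
bits (uint32_t insn, int hi, int lo)
{
  return (insn >> lo) & ((1u << (hi - lo + 1)) - 1);
}

int
main (void)
{
  uint32_t insn = 0xf8408c41;
  uint32_t dispatch = (bits (insn, 31, 30) << 2) | bits (insn, 23, 22);

  assert (dispatch == 13);            /* case 13: ldr_wb. */
  assert (bits (insn, 11, 11) == 1);  /* wb == Pre. */
  assert (bits (insn, 20, 12) == 8);  /* simm9 == +8. */
  return 0;
}
#endif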
10697 static void
10698 dexLoadRegisterOffset (sim_cpu *cpu)
10699 {
10700 /* instr[31,30] = size
10701 instr[29,27] = 111
10702 instr[26] = V
10703 instr[25,24] = 00
10704 instr[23,22] = opc
10705 instr[21] = 1
10706 instr[20,16] = rm
10707 instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10708 110 ==> SXTW, 111 ==> SXTX,
10709 ow ==> RESERVED
10710 instr[12] = scaled
10711 instr[11,10] = 10
10712 instr[9,5] = rn
10713 instr[4,0] = rt. */
10714
10715 uint32_t V = INSTR (26, 26);
10716 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10717 Scaling scale = INSTR (12, 12);
10718 Extension extensionType = INSTR (15, 13);
10719
10720 /* Check for illegal extension types. */
10721 if (uimm (extensionType, 1, 1) == 0)
10722 HALT_UNALLOC;
10723
10724 if (extensionType == UXTX || extensionType == SXTX)
10725 extensionType = NoExtension;
10726
10727 if (!V)
10728 {
10729 /* GReg operations. */
10730 switch (dispatch)
10731 {
10732 case 0: strb_scale_ext (cpu, scale, extensionType); return;
10733 case 1: ldrb32_scale_ext (cpu, scale, extensionType); return;
10734 case 2: ldrsb_scale_ext (cpu, scale, extensionType); return;
10735 case 3: ldrsb32_scale_ext (cpu, scale, extensionType); return;
10736 case 4: strh_scale_ext (cpu, scale, extensionType); return;
10737 case 5: ldrh32_scale_ext (cpu, scale, extensionType); return;
10738 case 6: ldrsh_scale_ext (cpu, scale, extensionType); return;
10739 case 7: ldrsh32_scale_ext (cpu, scale, extensionType); return;
10740 case 8: str32_scale_ext (cpu, scale, extensionType); return;
10741 case 9: ldr32_scale_ext (cpu, scale, extensionType); return;
10742 case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10743 case 12: str_scale_ext (cpu, scale, extensionType); return;
10744 case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10745 case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10746
10747 default:
10748 case 11:
10749 case 15:
10750 HALT_UNALLOC;
10751 }
10752 }
10753
10754 /* FReg operations. */
10755 switch (dispatch)
10756 {
10757 case 1: /* LDR 8 bit FP. */
10758 HALT_NYI;
10759 case 3: fldrq_scale_ext (cpu, scale, extensionType); return;
10760 case 5: /* LDR 16 bit FP. */
10761 HALT_NYI;
10762 case 9: fldrs_scale_ext (cpu, scale, extensionType); return;
10763 case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
10764
10765 case 0: fstrb_scale_ext (cpu, scale, extensionType); return;
10766 case 2: fstrq_scale_ext (cpu, scale, extensionType); return;
10767 case 4: fstrh_scale_ext (cpu, scale, extensionType); return;
10768 case 8: fstrs_scale_ext (cpu, scale, extensionType); return;
10769 case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
10770
10771 default:
10772 case 6:
10773 case 7:
10774 case 10:
10775 case 11:
10776 case 14:
10777 case 15:
10778 HALT_UNALLOC;
10779 }
10780 }
10781
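/* A standalone sketch of the zero- vs sign-extension relied on above:
   how a negative 32 bit offset register differs under the UXTW and SXTW
   options. */
#if 0
#include <stdint.h>
#include <assert.h>

int
main (void)
{
  uint32_t wm = 0xfffffffcu;               /* W register holding -4. */

  int64_t uxtw = (int64_t) (uint64_t) wm;  /* Zero-extended offset. */
  int64_t sxtw = (int64_t) (int32_t) wm;   /* Sign-extended offset. */

  assert (uxtw == 0xfffffffcLL);
  assert (sxtw == -4);
  return 0;
}
#endif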
10782 static void
10783 dexLoadUnsignedImmediate (sim_cpu *cpu)
10784 {
10785 /* instr[29,24] == 111_01
10786 instr[31,30] = size
10787 instr[26] = V
10788 instr[23,22] = opc
10789 instr[21,10] = uimm12 : unsigned immediate offset
10790 instr[9,5] = rn may be SP.
10791 instr[4,0] = rt. */
10792
10793 uint32_t V = INSTR (26,26);
10794 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10795 uint32_t imm = INSTR (21, 10);
10796
10797 if (!V)
10798 {
10799 /* GReg operations. */
10800 switch (dispatch)
10801 {
10802 case 0: strb_abs (cpu, imm); return;
10803 case 1: ldrb32_abs (cpu, imm); return;
10804 case 2: ldrsb_abs (cpu, imm); return;
10805 case 3: ldrsb32_abs (cpu, imm); return;
10806 case 4: strh_abs (cpu, imm); return;
10807 case 5: ldrh32_abs (cpu, imm); return;
10808 case 6: ldrsh_abs (cpu, imm); return;
10809 case 7: ldrsh32_abs (cpu, imm); return;
10810 case 8: str32_abs (cpu, imm); return;
10811 case 9: ldr32_abs (cpu, imm); return;
10812 case 10: ldrsw_abs (cpu, imm); return;
10813 case 12: str_abs (cpu, imm); return;
10814 case 13: ldr_abs (cpu, imm); return;
10815 case 14: prfm_abs (cpu, imm); return;
10816
10817 default:
10818 case 11:
10819 case 15:
10820 HALT_UNALLOC;
10821 }
10822 }
10823
10824 /* FReg operations. */
10825 switch (dispatch)
10826 {
10827 case 0: fstrb_abs (cpu, imm); return;
10828 case 4: fstrh_abs (cpu, imm); return;
10829 case 8: fstrs_abs (cpu, imm); return;
10830 case 12: fstrd_abs (cpu, imm); return;
10831 case 2: fstrq_abs (cpu, imm); return;
10832
10833 case 1: fldrb_abs (cpu, imm); return;
10834 case 5: fldrh_abs (cpu, imm); return;
10835 case 9: fldrs_abs (cpu, imm); return;
10836 case 13: fldrd_abs (cpu, imm); return;
10837 case 3: fldrq_abs (cpu, imm); return;
10838
10839 default:
10840 case 6:
10841 case 7:
10842 case 10:
10843 case 11:
10844 case 14:
10845 case 15:
10846 HALT_UNALLOC;
10847 }
10848 }
10849
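/* A sketch of the SCALE contract assumed by the *_abs helpers: uimm12
   counts elements, so it is shifted by log2 of the access size in bytes.
   For example LDR X0, [X1, #16] encodes uimm12 == 2, and SCALE (2, 64)
   yields 16. 'scale_sketch' is a hypothetical name. */
#if 0
static uint64_t
scale_sketch (uint32_t offset, unsigned bits)
{
  unsigned shift = (bits == 16 ? 1
                    : bits == 32 ? 2
                    : bits == 64 ? 3
                    : bits == 128 ? 4 : 0);
  return (uint64_t) offset << shift;
}
#endif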
10850 static void
10851 dexLoadExclusive (sim_cpu *cpu)
10852 {
10853 /* assert instr[29:24] = 001000;
10854 instr[31,30] = size
10855 instr[23] = 0 if exclusive
10856 instr[22] = L : 1 if load, 0 if store
10857 instr[21] = 1 if pair
10858 instr[20,16] = Rs
10859 instr[15] = o0 : 1 if ordered
10860 instr[14,10] = Rt2
10861 instr[9,5] = Rn
10862 instr[4,0] = Rt. */
10863
10864 switch (INSTR (22, 21))
10865 {
10866 case 2: ldxr (cpu); return;
10867 case 0: stxr (cpu); return;
10868 default: HALT_NYI;
10869 }
10870 }
10871
10872 static void
10873 dexLoadOther (sim_cpu *cpu)
10874 {
10875 uint32_t dispatch;
10876
10877 /* instr[29,25] = 111_0
10878 instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
10879 instr[21] and instr[11,10] form the secondary dispatch vector. */
10880 if (INSTR (24, 24))
10881 {
10882 dexLoadUnsignedImmediate (cpu);
10883 return;
10884 }
10885
10886 dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
10887 switch (dispatch)
10888 {
10889 case 0: dexLoadUnscaledImmediate (cpu); return;
10890 case 1: dexLoadImmediatePrePost (cpu); return;
10891 case 3: dexLoadImmediatePrePost (cpu); return;
10892 case 6: dexLoadRegisterOffset (cpu); return;
10893
10894 default:
10895 case 2:
10896 case 4:
10897 case 5:
10898 case 7:
10899 HALT_NYI;
10900 }
10901 }
10902
10903 static void
10904 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10905 {
10906 unsigned rn = INSTR (14, 10);
10907 unsigned rd = INSTR (9, 5);
10908 unsigned rm = INSTR (4, 0);
10909 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10910
10911 if ((rn == rd || rm == rd) && wb != NoWriteBack)
10912 HALT_UNALLOC; /* Writeback to a transfer register is UNPREDICTABLE. */
10913
10914 offset <<= 2;
10915
10916 if (wb != Post)
10917 address += offset;
10918
10919 aarch64_set_mem_u32 (cpu, address,
10920 aarch64_get_reg_u32 (cpu, rm, NO_SP));
10921 aarch64_set_mem_u32 (cpu, address + 4,
10922 aarch64_get_reg_u32 (cpu, rn, NO_SP));
10923
10924 if (wb == Post)
10925 address += offset;
10926
10927 if (wb != NoWriteBack)
10928 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10929 }
10930
10931 static void
10932 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10933 {
10934 unsigned rn = INSTR (14, 10);
10935 unsigned rd = INSTR (9, 5);
10936 unsigned rm = INSTR (4, 0);
10937 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10938
10939 if ((rn == rd || rm == rd) && wb != NoWriteBack)
10940 HALT_UNALLOC; /* Writeback to a transfer register is UNPREDICTABLE. */
10941
10942 offset <<= 3;
10943
10944 if (wb != Post)
10945 address += offset;
10946
10947 aarch64_set_mem_u64 (cpu, address,
10948 aarch64_get_reg_u64 (cpu, rm, NO_SP));
10949 aarch64_set_mem_u64 (cpu, address + 8,
10950 aarch64_get_reg_u64 (cpu, rn, NO_SP));
10951
10952 if (wb == Post)
10953 address += offset;
10954
10955 if (wb != NoWriteBack)
10956 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10957 }
10958
10959 static void
10960 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10961 {
10962 unsigned rn = INSTR (14, 10);
10963 unsigned rd = INSTR (9, 5);
10964 unsigned rm = INSTR (4, 0);
10965 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10966
10967 /* A load pair with Rt == Rt2 is UNPREDICTABLE; treat it as unallocated. */
10968 if (rn == rm)
10969 HALT_UNALLOC;
10970
10971 offset <<= 2;
10972
10973 if (wb != Post)
10974 address += offset;
10975
10976 aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
10977 aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
10978
10979 if (wb == Post)
10980 address += offset;
10981
10982 if (wb != NoWriteBack)
10983 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10984 }
10985
10986 static void
10987 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10988 {
10989 unsigned rn = INSTR (14, 10);
10990 unsigned rd = INSTR (9, 5);
10991 unsigned rm = INSTR (4, 0);
10992 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10993
10994 /* A load pair with Rt == Rt2 is UNPREDICTABLE; treat it as unallocated. */
10995 if (rn == rm)
10996 HALT_UNALLOC;
10997
10998 offset <<= 2;
10999
11000 if (wb != Post)
11001 address += offset;
11002
11003 aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
11004 aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
11005
11006 if (wb == Post)
11007 address += offset;
11008
11009 if (wb != NoWriteBack)
11010 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11011 }
11012
11013 static void
11014 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11015 {
11016 unsigned rn = INSTR (14, 10);
11017 unsigned rd = INSTR (9, 5);
11018 unsigned rm = INSTR (4, 0);
11019 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11020
11021 /* A load pair with Rt == Rt2 is UNPREDICTABLE; treat it as unallocated. */
11022 if (rn == rm)
11023 HALT_UNALLOC;
11024
11025 offset <<= 3;
11026
11027 if (wb != Post)
11028 address += offset;
11029
11030 aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
11031 aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
11032
11033 if (wb == Post)
11034 address += offset;
11035
11036 if (wb != NoWriteBack)
11037 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11038 }
11039
11040 static void
11041 dex_load_store_pair_gr (sim_cpu *cpu)
11042 {
11043 /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
11044 instr[29,25] = instruction encoding: 101_0
11045 instr[26] = V : 1 if fp 0 if gp
11046 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11047 instr[22] = load/store (1=> load)
11048 instr[21,15] = signed, scaled, offset
11049 instr[14,10] = Rn
11050 instr[ 9, 5] = Rd
11051 instr[ 4, 0] = Rm. */
11052
11053 uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11054 int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11055
11056 switch (dispatch)
11057 {
11058 case 2: store_pair_u32 (cpu, offset, Post); return;
11059 case 3: load_pair_u32 (cpu, offset, Post); return;
11060 case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11061 case 5: load_pair_u32 (cpu, offset, NoWriteBack); return;
11062 case 6: store_pair_u32 (cpu, offset, Pre); return;
11063 case 7: load_pair_u32 (cpu, offset, Pre); return;
11064
11065 case 11: load_pair_s32 (cpu, offset, Post); return;
11066 case 13: load_pair_s32 (cpu, offset, NoWriteBack); return;
11067 case 15: load_pair_s32 (cpu, offset, Pre); return;
11068
11069 case 18: store_pair_u64 (cpu, offset, Post); return;
11070 case 19: load_pair_u64 (cpu, offset, Post); return;
11071 case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11072 case 21: load_pair_u64 (cpu, offset, NoWriteBack); return;
11073 case 22: store_pair_u64 (cpu, offset, Pre); return;
11074 case 23: load_pair_u64 (cpu, offset, Pre); return;
11075
11076 default:
11077 HALT_UNALLOC;
11078 }
11079 }
11080
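/* A worked example (standalone sketch): STP X29, X30, [SP, #-16]! is
   0xa9bf7bfd, decoded with the same fields as dex_load_store_pair_gr
   above. 'bits' is a local stand-in for the INSTR macro; the sign
   extension assumes the usual arithmetic right shift of signed values. */
#if 0
#include <stdint.h>
#include <assert.h>

static uint32_t
bits (uint32_t insn, int hi, int lo)
{
  return (insn >> lo) & ((1u << (hi - lo + 1)) - 1);
}

int
main (void)
{
  uint32_t insn = 0xa9bf7bfd;
  uint32_t dispatch = (bits (insn, 31, 30) << 3) | bits (insn, 24, 22);
  int32_t imm7 = (int32_t) (bits (insn, 21, 15) << 25) >> 25;

  assert (dispatch == 22);           /* case 22: store_pair_u64, Pre. */
  assert (imm7 == -2);               /* Scaled by 8 in store_pair_u64: -16. */
  assert (bits (insn, 9, 5) == 31);  /* Rd field: the base is SP. */
  return 0;
}
#endif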
11081 static void
11082 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11083 {
11084 unsigned rn = INSTR (14, 10);
11085 unsigned rd = INSTR (9, 5);
11086 unsigned rm = INSTR (4, 0);
11087 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11088
11089 offset <<= 2;
11090
11091 if (wb != Post)
11092 address += offset;
11093
11094 aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, rm, 0));
11095 aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11096
11097 if (wb == Post)
11098 address += offset;
11099
11100 if (wb != NoWriteBack)
11101 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11102 }
11103
11104 static void
11105 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11106 {
11107 unsigned rn = INSTR (14, 10);
11108 unsigned rd = INSTR (9, 5);
11109 unsigned rm = INSTR (4, 0);
11110 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11111
11112 offset <<= 3;
11113
11114 if (wb != Post)
11115 address += offset;
11116
11117 aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, rm, 0));
11118 aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11119
11120 if (wb == Post)
11121 address += offset;
11122
11123 if (wb != NoWriteBack)
11124 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11125 }
11126
11127 static void
11128 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11129 {
11130 FRegister a;
11131 unsigned rn = INSTR (14, 10);
11132 unsigned rd = INSTR (9, 5);
11133 unsigned rm = INSTR (4, 0);
11134 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11135
11136 offset <<= 4;
11137
11138 if (wb != Post)
11139 address += offset;
11140
11141 aarch64_get_FP_long_double (cpu, rm, & a);
11142 aarch64_set_mem_long_double (cpu, address, a);
11143 aarch64_get_FP_long_double (cpu, rn, & a);
11144 aarch64_set_mem_long_double (cpu, address + 16, a);
11145
11146 if (wb == Post)
11147 address += offset;
11148
11149 if (wb != NoWriteBack)
11150 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11151 }
11152
11153 static void
11154 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11155 {
11156 unsigned rn = INSTR (14, 10);
11157 unsigned rd = INSTR (9, 5);
11158 unsigned rm = INSTR (4, 0);
11159 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11160
11161 if (rm == rn)
11162 HALT_UNALLOC;
11163
11164 offset <<= 2;
11165
11166 if (wb != Post)
11167 address += offset;
11168
11169 aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11170 aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11171
11172 if (wb == Post)
11173 address += offset;
11174
11175 if (wb != NoWriteBack)
11176 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11177 }
11178
11179 static void
11180 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11181 {
11182 unsigned rn = INSTR (14, 10);
11183 unsigned rd = INSTR (9, 5);
11184 unsigned rm = INSTR (4, 0);
11185 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11186
11187 if (rm == rn)
11188 HALT_UNALLOC;
11189
11190 offset <<= 3;
11191
11192 if (wb != Post)
11193 address += offset;
11194
11195 aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11196 aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11197
11198 if (wb == Post)
11199 address += offset;
11200
11201 if (wb != NoWriteBack)
11202 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11203 }
11204
11205 static void
11206 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11207 {
11208 FRegister a;
11209 unsigned rn = INSTR (14, 10);
11210 unsigned rd = INSTR (9, 5);
11211 unsigned rm = INSTR (4, 0);
11212 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11213
11214 if (rm == rn)
11215 HALT_UNALLOC;
11216
11217 offset <<= 4;
11218
11219 if (wb != Post)
11220 address += offset;
11221
11222 aarch64_get_mem_long_double (cpu, address, & a);
11223 aarch64_set_FP_long_double (cpu, rm, a);
11224 aarch64_get_mem_long_double (cpu, address + 16, & a);
11225 aarch64_set_FP_long_double (cpu, rn, a);
11226
11227 if (wb == Post)
11228 address += offset;
11229
11230 if (wb != NoWriteBack)
11231 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11232 }
11233
11234 static void
11235 dex_load_store_pair_fp (sim_cpu *cpu)
11236 {
11237 /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11238 instr[29,25] = instruction encoding
11239 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11240 instr[22] = load/store (1=> load)
11241 instr[21,15] = signed, scaled, offset
11242 instr[14,10] = Rn
11243 instr[ 9, 5] = Rd
11244 instr[ 4, 0] = Rm */
11245
11246 uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11247 int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11248
11249 switch (dispatch)
11250 {
11251 case 2: store_pair_float (cpu, offset, Post); return;
11252 case 3: load_pair_float (cpu, offset, Post); return;
11253 case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11254 case 5: load_pair_float (cpu, offset, NoWriteBack); return;
11255 case 6: store_pair_float (cpu, offset, Pre); return;
11256 case 7: load_pair_float (cpu, offset, Pre); return;
11257
11258 case 10: store_pair_double (cpu, offset, Post); return;
11259 case 11: load_pair_double (cpu, offset, Post); return;
11260 case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11261 case 13: load_pair_double (cpu, offset, NoWriteBack); return;
11262 case 14: store_pair_double (cpu, offset, Pre); return;
11263 case 15: load_pair_double (cpu, offset, Pre); return;
11264
11265 case 18: store_pair_long_double (cpu, offset, Post); return;
11266 case 19: load_pair_long_double (cpu, offset, Post); return;
11267 case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11268 case 21: load_pair_long_double (cpu, offset, NoWriteBack); return;
11269 case 22: store_pair_long_double (cpu, offset, Pre); return;
11270 case 23: load_pair_long_double (cpu, offset, Pre); return;
11271
11272 default:
11273 HALT_UNALLOC;
11274 }
11275 }
11276
11277 static inline unsigned
11278 vec_reg (unsigned v, unsigned o)
11279 {
11280 return (v + o) & 0x1F; /* Vector register numbers wrap modulo 32 (V0..V31). */
11281 }
11282
11283 /* Load multiple N-element structures to N consecutive registers. */
11284 static void
11285 vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
11286 {
11287 int all = INSTR (30, 30);
11288 unsigned size = INSTR (11, 10);
11289 unsigned vd = INSTR (4, 0);
11290 unsigned i;
11291
11292 switch (size)
11293 {
11294 case 0: /* 8-bit operations. */
11295 if (all)
11296 for (i = 0; i < (16 * N); i++)
11297 aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
11298 aarch64_get_mem_u8 (cpu, address + i));
11299 else
11300 for (i = 0; i < (8 * N); i++)
11301 aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
11302 aarch64_get_mem_u8 (cpu, address + i));
11303 return;
11304
11305 case 1: /* 16-bit operations. */
11306 if (all)
11307 for (i = 0; i < (8 * N); i++)
11308 aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
11309 aarch64_get_mem_u16 (cpu, address + i * 2));
11310 else
11311 for (i = 0; i < (4 * N); i++)
11312 aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
11313 aarch64_get_mem_u16 (cpu, address + i * 2));
11314 return;
11315
11316 case 2: /* 32-bit operations. */
11317 if (all)
11318 for (i = 0; i < (4 * N); i++)
11319 aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
11320 aarch64_get_mem_u32 (cpu, address + i * 4));
11321 else
11322 for (i = 0; i < (2 * N); i++)
11323 aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
11324 aarch64_get_mem_u32 (cpu, address + i * 4));
11325 return;
11326
11327 case 3: /* 64-bit operations. */
11328 if (all)
11329 for (i = 0; i < (2 * N); i++)
11330 aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
11331 aarch64_get_mem_u64 (cpu, address + i * 8));
11332 else
11333 for (i = 0; i < N; i++)
11334 aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
11335 aarch64_get_mem_u64 (cpu, address + i * 8));
11336 return;
11337 }
11338 }
11339
11340 /* LD4: load multiple 4-element to four consecutive registers. */
11341 static void
11342 LD4 (sim_cpu *cpu, uint64_t address)
11343 {
11344 vec_load (cpu, address, 4);
11345 }
11346
11347 /* LD3: load multiple 3-element structures to three consecutive registers. */
11348 static void
11349 LD3 (sim_cpu *cpu, uint64_t address)
11350 {
11351 vec_load (cpu, address, 3);
11352 }
11353
11354 /* LD2: load multiple 2-element structures to two consecutive registers. */
11355 static void
11356 LD2 (sim_cpu *cpu, uint64_t address)
11357 {
11358 vec_load (cpu, address, 2);
11359 }
11360
11361 /* Load multiple 1-element structures into one register. */
11362 static void
11363 LD1_1 (sim_cpu *cpu, uint64_t address)
11364 {
11365 int all = INSTR (30, 30);
11366 unsigned size = INSTR (11, 10);
11367 unsigned vd = INSTR (4, 0);
11368 unsigned i;
11369
11370 switch (size)
11371 {
11372 case 0:
11373 /* LD1 {Vd.16b}, addr, #16 */
11374 /* LD1 {Vd.8b}, addr, #8 */
11375 for (i = 0; i < (all ? 16 : 8); i++)
11376 aarch64_set_vec_u8 (cpu, vd, i,
11377 aarch64_get_mem_u8 (cpu, address + i));
11378 return;
11379
11380 case 1:
11381 /* LD1 {Vd.8h}, addr, #16 */
11382 /* LD1 {Vd.4h}, addr, #8 */
11383 for (i = 0; i < (all ? 8 : 4); i++)
11384 aarch64_set_vec_u16 (cpu, vd, i,
11385 aarch64_get_mem_u16 (cpu, address + i * 2));
11386 return;
11387
11388 case 2:
11389 /* LD1 {Vd.4s}, addr, #16 */
11390 /* LD1 {Vd.2s}, addr, #8 */
11391 for (i = 0; i < (all ? 4 : 2); i++)
11392 aarch64_set_vec_u32 (cpu, vd, i,
11393 aarch64_get_mem_u32 (cpu, address + i * 4));
11394 return;
11395
11396 case 3:
11397 /* LD1 {Vd.2d}, addr, #16 */
11398 /* LD1 {Vd.1d}, addr, #8 */
11399 for (i = 0; i < (all ? 2 : 1); i++)
11400 aarch64_set_vec_u64 (cpu, vd, i,
11401 aarch64_get_mem_u64 (cpu, address + i * 8));
11402 return;
11403 }
11404 }
11405
11406 /* Load multiple 1-element structures into two registers. */
11407 static void
11408 LD1_2 (sim_cpu *cpu, uint64_t address)
11409 {
11410 /* FIXME: This is the same code as the LD2 version, yet LD2 should
11411 de-interleave elements across the registers while LD1 fills them
11412 sequentially; vec_load implements only the LD1 behaviour (see below). */
11413 vec_load (cpu, address, 2);
11414 }
11415
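/* A sketch of the de-interleave that a faithful LD2 would perform for the
   8 bit, 128 bit variant (this is NOT what vec_load above does): the
   bytes of each two-element structure alternate between Vd and Vd+1. */
#if 0
static void
ld2_deinterleave_sketch (sim_cpu *cpu, uint64_t address)
{
  unsigned vd = INSTR (4, 0);
  unsigned i;

  for (i = 0; i < 16; i++)
    {
      aarch64_set_vec_u8 (cpu, vd, i,
                          aarch64_get_mem_u8 (cpu, address + i * 2));
      aarch64_set_vec_u8 (cpu, vec_reg (vd, 1), i,
                          aarch64_get_mem_u8 (cpu, address + i * 2 + 1));
    }
}
#endif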
11416 /* Load multiple 1-element structures into three registers. */
11417 static void
11418 LD1_3 (sim_cpu *cpu, uint64_t address)
11419 {
11420 /* FIXME: As with LD1_2, this shares vec_load with LD3, which ought
11421 to de-interleave across three registers; only the LD1 behaviour is
11422 implemented. */
11423 vec_load (cpu, address, 3);
11424 }
11425
11426 /* Load multiple 1-element structures into four registers. */
11427 static void
11428 LD1_4 (sim_cpu *cpu, uint64_t address)
11429 {
11430 /* FIXME: As with LD1_2, this shares vec_load with LD4, which ought
11431 to de-interleave across four registers; only the LD1 behaviour is
11432 implemented. */
11433 vec_load (cpu, address, 4);
11434 }
11435
11436 /* Store multiple N-element structures to N consecutive registers. */
11437 static void
11438 vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
11439 {
11440 int all = INSTR (30, 30);
11441 unsigned size = INSTR (11, 10);
11442 unsigned vd = INSTR (4, 0);
11443 unsigned i;
11444
11445 switch (size)
11446 {
11447 case 0: /* 8-bit operations. */
11448 if (all)
11449 for (i = 0; i < (16 * N); i++)
11450 aarch64_set_mem_u8
11451 (cpu, address + i,
11452 aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
11453 else
11454 for (i = 0; i < (8 * N); i++)
11455 aarch64_set_mem_u8
11456 (cpu, address + i,
11457 aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
11458 return;
11459
11460 case 1: /* 16-bit operations. */
11461 if (all)
11462 for (i = 0; i < (8 * N); i++)
11463 aarch64_set_mem_u16
11464 (cpu, address + i * 2,
11465 aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
11466 else
11467 for (i = 0; i < (4 * N); i++)
11468 aarch64_set_mem_u16
11469 (cpu, address + i * 2,
11470 aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
11471 return;
11472
11473 case 2: /* 32-bit operations. */
11474 if (all)
11475 for (i = 0; i < (4 * N); i++)
11476 aarch64_set_mem_u32
11477 (cpu, address + i * 4,
11478 aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
11479 else
11480 for (i = 0; i < (2 * N); i++)
11481 aarch64_set_mem_u32
11482 (cpu, address + i * 4,
11483 aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
11484 return;
11485
11486 case 3: /* 64-bit operations. */
11487 if (all)
11488 for (i = 0; i < (2 * N); i++)
11489 aarch64_set_mem_u64
11490 (cpu, address + i * 8,
11491 aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
11492 else
11493 for (i = 0; i < N; i++)
11494 aarch64_set_mem_u64
11495 (cpu, address + i * 8,
11496 aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
11497 return;
11498 }
11499 }
11500
11501 /* Store multiple 4-element structure to four consecutive registers. */
11502 static void
11503 ST4 (sim_cpu *cpu, uint64_t address)
11504 {
11505 vec_store (cpu, address, 4);
11506 }
11507
11508 /* Store multiple 3-element structures to three consecutive registers. */
11509 static void
11510 ST3 (sim_cpu *cpu, uint64_t address)
11511 {
11512 vec_store (cpu, address, 3);
11513 }
11514
11515 /* Store multiple 2-element structures to two consecutive registers. */
11516 static void
11517 ST2 (sim_cpu *cpu, uint64_t address)
11518 {
11519 vec_store (cpu, address, 2);
11520 }
11521
11522 /* Store multiple 1-element structures into one register. */
11523 static void
11524 ST1_1 (sim_cpu *cpu, uint64_t address)
11525 {
11526 int all = INSTR (30, 30);
11527 unsigned size = INSTR (11, 10);
11528 unsigned vd = INSTR (4, 0);
11529 unsigned i;
11530
11531 switch (size)
11532 {
11533 case 0:
11534 for (i = 0; i < (all ? 16 : 8); i++)
11535 aarch64_set_mem_u8 (cpu, address + i,
11536 aarch64_get_vec_u8 (cpu, vd, i));
11537 return;
11538
11539 case 1:
11540 for (i = 0; i < (all ? 8 : 4); i++)
11541 aarch64_set_mem_u16 (cpu, address + i * 2,
11542 aarch64_get_vec_u16 (cpu, vd, i));
11543 return;
11544
11545 case 2:
11546 for (i = 0; i < (all ? 4 : 2); i++)
11547 aarch64_set_mem_u32 (cpu, address + i * 4,
11548 aarch64_get_vec_u32 (cpu, vd, i));
11549 return;
11550
11551 case 3:
11552 for (i = 0; i < (all ? 2 : 1); i++)
11553 aarch64_set_mem_u64 (cpu, address + i * 8,
11554 aarch64_get_vec_u64 (cpu, vd, i));
11555 return;
11556 }
11557 }
11558
11559 /* Store multiple 1-element structures into two registers. */
11560 static void
11561 ST1_2 (sim_cpu *cpu, uint64_t address)
11562 {
11563 /* FIXME: This is the same code as the ST2 version, yet ST2 should
11564 interleave elements from the two registers into memory; vec_store
11565 implements only the ST1 behaviour. */
11566 vec_store (cpu, address, 2);
11567 }
11568
11569 /* Store multiple 1-element structures into three registers. */
11570 static void
11571 ST1_3 (sim_cpu *cpu, uint64_t address)
11572 {
11573 /* FIXME: As with ST1_2, this shares vec_store with ST3, which ought
11574 to interleave across three registers; only the ST1 behaviour is
11575 implemented. */
11576 vec_store (cpu, address, 3);
11577 }
11578
11579 /* Store multiple 1-element structures into four registers. */
11580 static void
11581 ST1_4 (sim_cpu *cpu, uint64_t address)
11582 {
11583 /* FIXME: As with ST1_2, this shares vec_store with ST4, which ought
11584 to interleave across four registers; only the ST1 behaviour is
11585 implemented. */
11586 vec_store (cpu, address, 4);
11587 }
11588
11589 #define LDn_STn_SINGLE_LANE_AND_SIZE() \
11590 do \
11591 { \
11592 switch (INSTR (15, 14)) \
11593 { \
11594 case 0: \
11595 lane = (full << 3) | (s << 2) | size; \
11596 size = 0; \
11597 break; \
11598 \
11599 case 1: \
11600 if ((size & 1) == 1) \
11601 HALT_UNALLOC; \
11602 lane = (full << 2) | (s << 1) | (size >> 1); \
11603 size = 1; \
11604 break; \
11605 \
11606 case 2: \
11607 if ((size & 2) == 2) \
11608 HALT_UNALLOC; \
11609 \
11610 if ((size & 1) == 0) \
11611 { \
11612 lane = (full << 1) | s; \
11613 size = 2; \
11614 } \
11615 else \
11616 { \
11617 if (s) \
11618 HALT_UNALLOC; \
11619 lane = full; \
11620 size = 3; \
11621 } \
11622 break; \
11623 \
11624 default: \
11625 HALT_UNALLOC; \
11626 } \
11627 } \
11628 while (0)
11629
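/* A worked example (standalone sketch) of the lane/size computation in
   the macro above, for LD1 {Vd.B}[9]: opcode(15,14) == 0, Q/full == 1,
   S == 0 and size == 01 give byte lane 9, with size reset to 0. */
#if 0
#include <assert.h>

int
main (void)
{
  unsigned full = 1, s = 0, size = 1;   /* Fields from the encoding. */
  unsigned lane = (full << 3) | (s << 2) | size;

  assert (lane == 9);                   /* Lane 9 of a byte access. */
  return 0;
}
#endif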
11630 /* Load single structure into one lane of N registers. */
11631 static void
11632 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
11633 {
11634 /* instr[31] = 0
11635 instr[30] = element selector 0=>half, 1=>all elements
11636 instr[29,24] = 00 1101
11637 instr[23] = 0=>simple, 1=>post
11638 instr[22] = 1
11639 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11640 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11641 11111 (immediate post inc)
11642 instr[15,13] = opcode
11643 instr[12] = S, used for lane number
11644 instr[11,10] = size, also used for lane number
11645 instr[9,5] = address
11646 instr[4,0] = Vd */
11647
11648 unsigned full = INSTR (30, 30);
11649 unsigned vd = INSTR (4, 0);
11650 unsigned size = INSTR (11, 10);
11651 unsigned s = INSTR (12, 12);
11652 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11653 int lane = 0;
11654 int i;
11655
11656 NYI_assert (29, 24, 0x0D);
11657 NYI_assert (22, 22, 1);
11658
11659 /* Compute the lane number first (using size), and then compute size. */
11660 LDn_STn_SINGLE_LANE_AND_SIZE ();
11661
11662 for (i = 0; i < nregs; i++)
11663 switch (size)
11664 {
11665 case 0:
11666 {
11667 uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
11668 aarch64_set_vec_u8 (cpu, vd + i, lane, val);
11669 break;
11670 }
11671
11672 case 1:
11673 {
11674 uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
11675 aarch64_set_vec_u16 (cpu, vd + i, lane, val);
11676 break;
11677 }
11678
11679 case 2:
11680 {
11681 uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
11682 aarch64_set_vec_u32 (cpu, vd + i, lane, val);
11683 break;
11684 }
11685
11686 case 3:
11687 {
11688 uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
11689 aarch64_set_vec_u64 (cpu, vd + i, lane, val);
11690 break;
11691 }
11692 }
11693 }
11694
11695 /* Store single structure from one lane from N registers. */
11696 static void
11697 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
11698 {
11699 /* instr[31] = 0
11700 instr[30] = element selector 0=>half, 1=>all elements
11701 instr[29,24] = 00 1101
11702 instr[23] = 0=>simple, 1=>post
11703 instr[22] = 0
11704 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11705 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11706 11111 (immediate post inc)
11707 instr[15,13] = opcode
11708 instr[12] = S, used for lane number
11709 instr[11,10] = size, also used for lane number
11710 instr[9,5] = address
11711 instr[4,0] = Vd */
11712
11713 unsigned full = INSTR (30, 30);
11714 unsigned vd = INSTR (4, 0);
11715 unsigned size = INSTR (11, 10);
11716 unsigned s = INSTR (12, 12);
11717 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11718 int lane = 0;
11719 int i;
11720
11721 NYI_assert (29, 24, 0x0D);
11722 NYI_assert (22, 22, 0);
11723
11724 /* Compute the lane number first (using size), and then compute size. */
11725 LDn_STn_SINGLE_LANE_AND_SIZE ();
11726
11727 for (i = 0; i < nregs; i++)
11728 switch (size)
11729 {
11730 case 0:
11731 {
11732 uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
11733 aarch64_set_mem_u8 (cpu, address + i, val);
11734 break;
11735 }
11736
11737 case 1:
11738 {
11739 uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
11740 aarch64_set_mem_u16 (cpu, address + (i * 2), val);
11741 break;
11742 }
11743
11744 case 2:
11745 {
11746 uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
11747 aarch64_set_mem_u32 (cpu, address + (i * 4), val);
11748 break;
11749 }
11750
11751 case 3:
11752 {
11753 uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
11754 aarch64_set_mem_u64 (cpu, address + (i * 8), val);
11755 break;
11756 }
11757 }
11758 }
11759
11760 /* Load single structure into all lanes of N registers. */
11761 static void
11762 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11763 {
11764 /* instr[31] = 0
11765 instr[30] = element selector 0=>half, 1=>all elements
11766 instr[29,24] = 00 1101
11767 instr[23] = 0=>simple, 1=>post
11768 instr[22] = 1
11769 instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11770 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11771 11111 (immediate post inc)
11772 instr[15,14] = 11
11773 instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11774 instr[12] = 0
11775 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11776 10=> word(s), 11=> double(d)
11777 instr[9,5] = address
11778 instr[4,0] = Vd */
11779
11780 unsigned full = INSTR (30, 30);
11781 unsigned vd = INSTR (4, 0);
11782 unsigned size = INSTR (11, 10);
11783 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11784 int i, n;
11785
11786 NYI_assert (29, 24, 0x0D);
11787 NYI_assert (22, 22, 1);
11788 NYI_assert (15, 14, 3);
11789 NYI_assert (12, 12, 0);
11790
11791 for (n = 0; n < nregs; n++)
11792 switch (size)
11793 {
11794 case 0:
11795 {
11796 uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
11797 for (i = 0; i < (full ? 16 : 8); i++)
11798 aarch64_set_vec_u8 (cpu, vd + n, i, val);
11799 break;
11800 }
11801
11802 case 1:
11803 {
11804 uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
11805 for (i = 0; i < (full ? 8 : 4); i++)
11806 aarch64_set_vec_u16 (cpu, vd + n, i, val);
11807 break;
11808 }
11809
11810 case 2:
11811 {
11812 uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
11813 for (i = 0; i < (full ? 4 : 2); i++)
11814 aarch64_set_vec_u32 (cpu, vd + n, i, val);
11815 break;
11816 }
11817
11818 case 3:
11819 {
11820 uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
11821 for (i = 0; i < (full ? 2 : 1); i++)
11822 aarch64_set_vec_u64 (cpu, vd + n, i, val);
11823 break;
11824 }
11825
11826 default:
11827 HALT_UNALLOC;
11828 }
11829 }
11830
11831 static void
11832 do_vec_load_store (sim_cpu *cpu)
11833 {
11834 /* {LD|ST}<N> {Vd..Vd+N}, vaddr
11835
11836 instr[31] = 0
11837 instr[30] = element selector 0=>half, 1=>all elements
11838 instr[29,25] = 00110
11839 instr[24] = 0=>multiple struct, 1=>single struct
11840 instr[23] = 0=>simple, 1=>post
11841 instr[22] = 0=>store, 1=>load
11842 instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
11843 instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11844 11111 (immediate post inc)
11845 instr[15,12] = elements and destinations. eg for load:
11846 0000=>LD4 => load multiple 4-element to
11847 four consecutive registers
11848 0100=>LD3 => load multiple 3-element to
11849 three consecutive registers
11850 1000=>LD2 => load multiple 2-element to
11851 two consecutive registers
11852 0010=>LD1 => load multiple 1-element to
11853 four consecutive registers
11854 0110=>LD1 => load multiple 1-element to
11855 three consecutive registers
11856 1010=>LD1 => load multiple 1-element to
11857 two consecutive registers
11858 0111=>LD1 => load multiple 1-element to
11859 one register
11860 1100=>LD1R,LD2R
11861 1110=>LD3R,LD4R
11862 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11863 10=> word(s), 11=> double(d)
11864 instr[9,5] = Vn, can be SP
11865 instr[4,0] = Vd */
11866
11867 int single;
11868 int post;
11869 int load;
11870 unsigned vn;
11871 uint64_t address;
11872 int type;
11873
11874 if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
11875 HALT_NYI;
11876
11877 single = INSTR (24, 24);
11878 post = INSTR (23, 23);
11879 load = INSTR (22, 22);
11880 type = INSTR (15, 12);
11881 vn = INSTR (9, 5);
11882 address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
11883
11884 if (! single && INSTR (21, 21) != 0)
11885 HALT_UNALLOC;
11886
11887 if (post)
11888 {
11889 unsigned vm = INSTR (20, 16);
11890
11891 if (vm == R31)
11892 {
11893 unsigned sizeof_operation;
11894
11895 if (single)
11896 {
11897 if ((type >= 0) && (type <= 11))
11898 {
11899 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11900 switch (INSTR (15, 14))
11901 {
11902 case 0:
11903 sizeof_operation = nregs * 1;
11904 break;
11905 case 1:
11906 sizeof_operation = nregs * 2;
11907 break;
11908 case 2:
11909 if (INSTR (10, 10) == 0)
11910 sizeof_operation = nregs * 4;
11911 else
11912 sizeof_operation = nregs * 8;
11913 break;
11914 default:
11915 HALT_UNALLOC;
11916 }
11917 }
11918 else if (type == 0xC)
11919 {
11920 sizeof_operation = INSTR (21, 21) ? 2 : 1;
11921 sizeof_operation <<= INSTR (11, 10);
11922 }
11923 else if (type == 0xE)
11924 {
11925 sizeof_operation = INSTR (21, 21) ? 4 : 3;
11926 sizeof_operation <<= INSTR (11, 10);
11927 }
11928 else
11929 HALT_UNALLOC;
11930 }
11931 else
11932 {
11933 switch (type)
11934 {
11935 case 0: sizeof_operation = 32; break;
11936 case 4: sizeof_operation = 24; break;
11937 case 8: sizeof_operation = 16; break;
11938
11939 case 7:
11940 /* One register, immediate offset variant. */
11941 sizeof_operation = 8;
11942 break;
11943
11944 case 10:
11945 /* Two registers, immediate offset variant. */
11946 sizeof_operation = 16;
11947 break;
11948
11949 case 6:
11950 /* Three registers, immediate offset variant. */
11951 sizeof_operation = 24;
11952 break;
11953
11954 case 2:
11955 /* Four registers, immediate offset variant. */
11956 sizeof_operation = 32;
11957 break;
11958
11959 default:
11960 HALT_UNALLOC;
11961 }
11962
11963 if (INSTR (30, 30))
11964 sizeof_operation *= 2;
11965 }
11966
11967 aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
11968 }
11969 else
11970 aarch64_set_reg_u64 (cpu, vn, SP_OK,
11971 address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
11972 }
11973 else
11974 {
11975 NYI_assert (20, 16, 0);
11976 }
11977
11978 if (single)
11979 {
11980 if (load)
11981 {
11982 if ((type >= 0) && (type <= 11))
11983 do_vec_LDn_single (cpu, address);
11984 else if ((type == 0xC) || (type == 0xE))
11985 do_vec_LDnR (cpu, address);
11986 else
11987 HALT_UNALLOC;
11988 return;
11989 }
11990
11991 /* Stores. */
11992 if ((type >= 0) && (type <= 11))
11993 {
11994 do_vec_STn_single (cpu, address);
11995 return;
11996 }
11997
11998 HALT_UNALLOC;
11999 }
12000
12001 if (load)
12002 {
12003 switch (type)
12004 {
12005 case 0: LD4 (cpu, address); return;
12006 case 4: LD3 (cpu, address); return;
12007 case 8: LD2 (cpu, address); return;
12008 case 2: LD1_4 (cpu, address); return;
12009 case 6: LD1_3 (cpu, address); return;
12010 case 10: LD1_2 (cpu, address); return;
12011 case 7: LD1_1 (cpu, address); return;
12012
12013 default:
12014 HALT_UNALLOC;
12015 }
12016 }
12017
12018 /* Stores. */
12019 switch (type)
12020 {
12021 case 0: ST4 (cpu, address); return;
12022 case 4: ST3 (cpu, address); return;
12023 case 8: ST2 (cpu, address); return;
12024 case 2: ST1_4 (cpu, address); return;
12025 case 6: ST1_3 (cpu, address); return;
12026 case 10: ST1_2 (cpu, address); return;
12027 case 7: ST1_1 (cpu, address); return;
12028 default:
12029 HALT_UNALLOC;
12030 }
12031 }
12032
12033 static void
12034 dexLdSt (sim_cpu *cpu)
12035 {
12036 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
12037 assert group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
12038 group == GROUP_LDST_1100 || group == GROUP_LDST_1110
12039 bits [29,28:26] of a LS are the secondary dispatch vector. */
12040 uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
12041
12042 switch (group2)
12043 {
12044 case LS_EXCL_000:
12045 dexLoadExclusive (cpu); return;
12046
12047 case LS_LIT_010:
12048 case LS_LIT_011:
12049 dexLoadLiteral (cpu); return;
12050
12051 case LS_OTHER_110:
12052 case LS_OTHER_111:
12053 dexLoadOther (cpu); return;
12054
12055 case LS_ADVSIMD_001:
12056 do_vec_load_store (cpu); return;
12057
12058 case LS_PAIR_100:
12059 dex_load_store_pair_gr (cpu); return;
12060
12061 case LS_PAIR_101:
12062 dex_load_store_pair_fp (cpu); return;
12063
12064 default:
12065 /* Should never reach here. */
12066 HALT_NYI;
12067 }
12068 }
12069
12070 /* Specific decode and execute for group Data Processing Register. */
12071
12072 static void
12073 dexLogicalShiftedRegister (sim_cpu *cpu)
12074 {
12075 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12076 instr[30,29] = op
12077 instr[28,24] = 01010
12078 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12079 instr[21] = N
12080 instr[20,16] = Rm
12081 instr[15,10] = count : must be 0xxxxx for 32 bit
12082 instr[9,5] = Rn
12083 instr[4,0] = Rd */
12084
12085 uint32_t size = INSTR (31, 31);
12086 Shift shiftType = INSTR (23, 22);
12087 uint32_t count = INSTR (15, 10);
12088
12089 /* 32 bit operations must have count[5] = 0,
12090 otherwise the encoding is UNALLOC. */
12091 if (size == 0 && uimm (count, 5, 5))
12092 HALT_UNALLOC;
12093
12094 /* Dispatch on size:op:N. */
12095 switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12096 {
12097 case 0: and32_shift (cpu, shiftType, count); return;
12098 case 1: bic32_shift (cpu, shiftType, count); return;
12099 case 2: orr32_shift (cpu, shiftType, count); return;
12100 case 3: orn32_shift (cpu, shiftType, count); return;
12101 case 4: eor32_shift (cpu, shiftType, count); return;
12102 case 5: eon32_shift (cpu, shiftType, count); return;
12103 case 6: ands32_shift (cpu, shiftType, count); return;
12104 case 7: bics32_shift (cpu, shiftType, count); return;
12105 case 8: and64_shift (cpu, shiftType, count); return;
12106 case 9: bic64_shift (cpu, shiftType, count); return;
12107 case 10: orr64_shift (cpu, shiftType, count); return;
12108 case 11: orn64_shift (cpu, shiftType, count); return;
12109 case 12: eor64_shift (cpu, shiftType, count); return;
12110 case 13: eon64_shift (cpu, shiftType, count); return;
12111 case 14: ands64_shift (cpu, shiftType, count); return;
12112 case 15: bics64_shift (cpu, shiftType, count); return;
12113 }
12114 }
12115
12116 /* 32 bit conditional select. */
12117 static void
12118 csel32 (sim_cpu *cpu, CondCode cc)
12119 {
12120 unsigned rm = INSTR (20, 16);
12121 unsigned rn = INSTR (9, 5);
12122 unsigned rd = INSTR (4, 0);
12123
12124 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12125 testConditionCode (cpu, cc)
12126 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12127 : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12128 }
12129
12130 /* 64 bit conditional select. */
12131 static void
12132 csel64 (sim_cpu *cpu, CondCode cc)
12133 {
12134 unsigned rm = INSTR (20, 16);
12135 unsigned rn = INSTR (9, 5);
12136 unsigned rd = INSTR (4, 0);
12137
12138 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12139 testConditionCode (cpu, cc)
12140 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12141 : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12142 }
12143
12144 /* 32 bit conditional increment. */
12145 static void
12146 csinc32 (sim_cpu *cpu, CondCode cc)
12147 {
12148 unsigned rm = INSTR (20, 16);
12149 unsigned rn = INSTR (9, 5);
12150 unsigned rd = INSTR (4, 0);
12151
12152 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12153 testConditionCode (cpu, cc)
12154 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12155 : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12156 }
12157
12158 /* 64 bit conditional increment. */
12159 static void
12160 csinc64 (sim_cpu *cpu, CondCode cc)
12161 {
12162 unsigned rm = INSTR (20, 16);
12163 unsigned rn = INSTR (9, 5);
12164 unsigned rd = INSTR (4, 0);
12165
12166 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12167 testConditionCode (cpu, cc)
12168 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12169 : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12170 }
12171
12172 /* 32 bit conditional invert. */
12173 static void
12174 csinv32 (sim_cpu *cpu, CondCode cc)
12175 {
12176 unsigned rm = INSTR (20, 16);
12177 unsigned rn = INSTR (9, 5);
12178 unsigned rd = INSTR (4, 0);
12179
12180 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12181 testConditionCode (cpu, cc)
12182 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12183 : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12184 }
12185
12186 /* 64 bit conditional invert. */
12187 static void
12188 csinv64 (sim_cpu *cpu, CondCode cc)
12189 {
12190 unsigned rm = INSTR (20, 16);
12191 unsigned rn = INSTR (9, 5);
12192 unsigned rd = INSTR (4, 0);
12193
12194 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12195 testConditionCode (cpu, cc)
12196 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12197 : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12198 }
12199
12200 /* 32 bit conditional negate. */
12201 static void
12202 csneg32 (sim_cpu *cpu, CondCode cc)
12203 {
12204 unsigned rm = INSTR (20, 16);
12205 unsigned rn = INSTR (9, 5);
12206 unsigned rd = INSTR (4, 0);
12207
12208 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12209 testConditionCode (cpu, cc)
12210 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12211 : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12212 }
12213
12214 /* 64 bit conditional negate. */
12215 static void
12216 csneg64 (sim_cpu *cpu, CondCode cc)
12217 {
12218 unsigned rm = INSTR (20, 16);
12219 unsigned rn = INSTR (9, 5);
12220 unsigned rd = INSTR (4, 0);
12221
12222 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12223 testConditionCode (cpu, cc)
12224 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12225 : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12226 }
12227
12228 static void
12229 dexCondSelect (sim_cpu *cpu)
12230 {
12231 /* instr[28,21] = 11011011
12232 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12233 instr[30] and instr[11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12234 100 ==> CSINV, 101 ==> CSNEG,
12235 _1_ ==> UNALLOC
12236 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12237 instr[15,12] = cond */
12239
12240 CondCode cc = INSTR (15, 12);
12241 uint32_t S = INSTR (29, 29);
12242 uint32_t op2 = INSTR (11, 10);
12243
12244 if (S == 1)
12245 HALT_UNALLOC;
12246
12247 if (op2 & 0x2)
12248 HALT_UNALLOC;
12249
12250 switch ((INSTR (31, 30) << 1) | op2)
12251 {
12252 case 0: csel32 (cpu, cc); return;
12253 case 1: csinc32 (cpu, cc); return;
12254 case 2: csinv32 (cpu, cc); return;
12255 case 3: csneg32 (cpu, cc); return;
12256 case 4: csel64 (cpu, cc); return;
12257 case 5: csinc64 (cpu, cc); return;
12258 case 6: csinv64 (cpu, cc); return;
12259 case 7: csneg64 (cpu, cc); return;
12260 }
12261 }
12262
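/* A worked example (standalone sketch): CSET W0, EQ is the alias
   CSINC W0, WZR, WZR, NE == 0x1a9f17e0, decoded with the same fields
   as dexCondSelect above. 'bits' is a local stand-in for INSTR. */
#if 0
#include <stdint.h>
#include <assert.h>

static uint32_t
bits (uint32_t insn, int hi, int lo)
{
  return (insn >> lo) & ((1u << (hi - lo + 1)) - 1);
}

int
main (void)
{
  uint32_t insn = 0x1a9f17e0;

  assert (bits (insn, 29, 29) == 0);            /* S == 0: allocated. */
  assert (((bits (insn, 31, 30) << 1)
           | bits (insn, 11, 10)) == 1);        /* case 1: csinc32. */
  assert (bits (insn, 15, 12) == 1);            /* cond == NE. */
  return 0;
}
#endif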
12263 /* Some helpers for counting leading 1 or 0 bits. */
12264
12265 /* Counts the number of leading bits which are the same
12266 in a 32 bit value in the range 1 to 32. */
12267 static uint32_t
12268 leading32 (uint32_t value)
12269 {
12270 int32_t mask = 0xffff0000;
12271 uint32_t count = 16; /* Counts number of bits set in mask. */
12272 uint32_t lo = 1; /* Lower bound for number of sign bits. */
12273 uint32_t hi = 32; /* Upper bound for number of sign bits. */
12274
12275 while (lo + 1 < hi)
12276 {
12277 int32_t test = (value & mask);
12278
12279 if (test == 0 || test == mask)
12280 {
12281 lo = count;
12282 count = (lo + hi) / 2;
12283 mask >>= (count - lo);
12284 }
12285 else
12286 {
12287 hi = count;
12288 count = (lo + hi) / 2;
12289 mask <<= hi - count;
12290 }
12291 }
12292
12293 if (lo != hi)
12294 {
12295 int32_t test;
12296
12297 mask >>= 1;
12298 test = (value & mask);
12299
12300 if (test == 0 || test == mask)
12301 count = hi;
12302 else
12303 count = lo;
12304 }
12305
12306 return count;
12307 }
12308
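/* A naive reference for leading32 (standalone sketch): count how many
   copies of the top bit appear before the first bit that differs, giving
   a result in the range 1 to 32, the same contract as the binary search
   above. 'leading32_ref' is a hypothetical name. */
#if 0
#include <stdint.h>

static uint32_t
leading32_ref (uint32_t value)
{
  uint32_t top = value >> 31;     /* The sign (top) bit. */
  uint32_t count = 1;
  int i;

  for (i = 30; i >= 0 && ((value >> i) & 1) == top; i--)
    count++;
  return count;
}
#endif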
12309 /* Counts the number of leading bits which are the same
12310 in a 64 bit value in the range 1 to 64. */
12311 static uint64_t
12312 leading64 (uint64_t value)
12313 {
12314 int64_t mask = 0xffffffff00000000LL;
12315 uint64_t count = 32; /* Counts number of bits set in mask. */
12316 uint64_t lo = 1; /* Lower bound for number of sign bits. */
12317 uint64_t hi = 64; /* Upper bound for number of sign bits. */
12318
12319 while (lo + 1 < hi)
12320 {
12321 int64_t test = (value & mask);
12322
12323 if (test == 0 || test == mask)
12324 {
12325 lo = count;
12326 count = (lo + hi) / 2;
12327 mask >>= (count - lo);
12328 }
12329 else
12330 {
12331 hi = count;
12332 count = (lo + hi) / 2;
12333 mask <<= hi - count;
12334 }
12335 }
12336
12337 if (lo != hi)
12338 {
12339 int64_t test;
12340
12341 mask >>= 1;
12342 test = (value & mask);
12343
12344 if (test == 0 || test == mask)
12345 count = hi;
12346 else
12347 count = lo;
12348 }
12349
12350 return count;
12351 }
12352
12353 /* Bit operations. */
12354 /* N.B register args may not be SP. */
12355
12356 /* 32 bit count leading sign bits. */
12357 static void
12358 cls32 (sim_cpu *cpu)
12359 {
12360 unsigned rn = INSTR (9, 5);
12361 unsigned rd = INSTR (4, 0);
12362
12363 /* N.B. the result needs to exclude the leading bit. */
12364 aarch64_set_reg_u64
12365 (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12366 }
12367
12368 /* 64 bit count leading sign bits. */
12369 static void
12370 cls64 (sim_cpu *cpu)
12371 {
12372 unsigned rn = INSTR (9, 5);
12373 unsigned rd = INSTR (4, 0);
12374
12375 /* N.B. the result needs to exclude the leading bit. */
12376 aarch64_set_reg_u64
12377 (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12378 }
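
/* Example: for the 32 bit value 0xffff0000 the top sixteen bits all
   match the sign bit, so leading32 returns 16 and CLS yields 15.  On
   GCC/Clang hosts (an assumption, not ISO C) the same value is given
   by __builtin_clrsb (0xffff0000).  */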
12379
12380 /* 32 bit count leading zero bits. */
12381 static void
12382 clz32 (sim_cpu *cpu)
12383 {
12384 unsigned rn = INSTR (9, 5);
12385 unsigned rd = INSTR (4, 0);
12386 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12387
12388 /* if the sign (top) bit is set then the count is 0. */
12389 if (pick32 (value, 31, 31))
12390 aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12391 else
12392 aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12393 }
12394
12395 /* 64 bit count leading zero bits. */
12396 static void
12397 clz64 (sim_cpu *cpu)
12398 {
12399 unsigned rn = INSTR (9, 5);
12400 unsigned rd = INSTR (4, 0);
12401 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12402
12403 /* if the sign (top) bit is set then the count is 0. */
12404 if (pick64 (value, 63, 63))
12405 aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12406 else
12407 aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12408 }
12409
12410 /* 32 bit reverse bits. */
12411 static void
12412 rbit32 (sim_cpu *cpu)
12413 {
12414 unsigned rn = INSTR (9, 5);
12415 unsigned rd = INSTR (4, 0);
12416 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12417 uint32_t result = 0;
12418 int i;
12419
12420 for (i = 0; i < 32; i++)
12421 {
12422 result <<= 1;
12423 result |= (value & 1);
12424 value >>= 1;
12425 }
12426 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12427 }
12428
12429 /* 64 bit reverse bits. */
12430 static void
12431 rbit64 (sim_cpu *cpu)
12432 {
12433 unsigned rn = INSTR (9, 5);
12434 unsigned rd = INSTR (4, 0);
12435 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12436 uint64_t result = 0;
12437 int i;
12438
12439 for (i = 0; i < 64; i++)
12440 {
12441 result <<= 1;
12442 result |= (value & 1UL);
12443 value >>= 1;
12444 }
12445 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12446 }
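
/* Example: rbit32 maps 0x00000001 to 0x80000000 and 0x12345678 to
   0x1e6a2c48.  */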
12447
12448 /* 32 bit reverse bytes. */
12449 static void
12450 rev32 (sim_cpu *cpu)
12451 {
12452 unsigned rn = INSTR (9, 5);
12453 unsigned rd = INSTR (4, 0);
12454 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12455 uint32_t result = 0;
12456 int i;
12457
12458 for (i = 0; i < 4; i++)
12459 {
12460 result <<= 8;
12461 result |= (value & 0xff);
12462 value >>= 8;
12463 }
12464 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12465 }
12466
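/* 64 bit reverse bytes within each 32 bit word (REV32).  */
static void
revw64 (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t result = 0;
  int i;

  for (i = 0; i < 8; i++)
    /* Move each byte to the mirrored position within its own word.  */
    result |= ((value >> (i * 8)) & 0xffULL) << ((i ^ 3) * 8);

  aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
}
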
12467 /* 64 bit reverse bytes. */
12468 static void
12469 rev64 (sim_cpu *cpu)
12470 {
12471 unsigned rn = INSTR (9, 5);
12472 unsigned rd = INSTR (4, 0);
12473 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12474 uint64_t result = 0;
12475 int i;
12476
12477 for (i = 0; i < 8; i++)
12478 {
12479 result <<= 8;
12480 result |= (value & 0xffULL);
12481 value >>= 8;
12482 }
12483 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12484 }
12485
12486 /* 32 bit reverse shorts. */
/* N.B. this reverses the order of the bytes in each half word.  */
12488 static void
12489 revh32 (sim_cpu *cpu)
12490 {
12491 unsigned rn = INSTR (9, 5);
12492 unsigned rd = INSTR (4, 0);
12493 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12494 uint32_t result = 0;
12495 int i;
12496
12497 for (i = 0; i < 2; i++)
12498 {
12499 result <<= 8;
12500 result |= (value & 0x00ff00ff);
12501 value >>= 8;
12502 }
12503 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12504 }
12505
12506 /* 64 bit reverse shorts. */
/* N.B. this reverses the order of the bytes in each half word.  */
12508 static void
12509 revh64 (sim_cpu *cpu)
12510 {
12511 unsigned rn = INSTR (9, 5);
12512 unsigned rd = INSTR (4, 0);
12513 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12514 uint64_t result = 0;
12515 int i;
12516
12517 for (i = 0; i < 2; i++)
12518 {
12519 result <<= 8;
12520 result |= (value & 0x00ff00ff00ff00ffULL);
12521 value >>= 8;
12522 }
12523 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12524 }
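
/* Example: revh32 maps 0x11223344 to 0x22114433 -- the two bytes inside
   each 16 bit half are swapped but the halves themselves stay put.  */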
12525
12526 static void
12527 dexDataProc1Source (sim_cpu *cpu)
12528 {
12529 /* instr[30] = 1
     instr[28,21] = 11010110
12531 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12532 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12533 instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12534 instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
                          000010 ==> REV (REV32 when size == 1),
                          000011 ==> UNALLOC (REV when size == 1),
12536 000100 ==> CLZ, 000101 ==> CLS
12537 ow ==> UNALLOC
12538 instr[9,5] = rn : may not be SP
12539 instr[4,0] = rd : may not be SP. */
12540
12541 uint32_t S = INSTR (29, 29);
12542 uint32_t opcode2 = INSTR (20, 16);
12543 uint32_t opcode = INSTR (15, 10);
12544 uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12545
12546 if (S == 1)
12547 HALT_UNALLOC;
12548
12549 if (opcode2 != 0)
12550 HALT_UNALLOC;
12551
12552 if (opcode & 0x38)
12553 HALT_UNALLOC;
12554
12555 switch (dispatch)
12556 {
12557 case 0: rbit32 (cpu); return;
12558 case 1: revh32 (cpu); return;
12559 case 2: rev32 (cpu); return;
12560 case 4: clz32 (cpu); return;
12561 case 5: cls32 (cpu); return;
12562 case 8: rbit64 (cpu); return;
12563 case 9: revh64 (cpu); return;
    case 10: revw64 (cpu); return;
    case 11: rev64 (cpu); return;
    case 12: clz64 (cpu); return;
    case 13: cls64 (cpu); return;
12568 default: HALT_UNALLOC;
12569 }
12570 }
12571
12572 /* Variable shift.
12573 Shifts by count supplied in register.
   N.B. register args may not be SP.
12575 These all use the shifted auxiliary function for
12576 simplicity and clarity. Writing the actual shift
12577 inline would avoid a branch and so be faster but
12578 would also necessitate getting signs right. */
12579
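/* As a minimal sketch of the sign handling the note above refers to,
   an inline 32 bit arithmetic shift right might look like the code
   below (kept under #if 0 so it plays no part in the build; shifting a
   negative signed value right is implementation-defined in ISO C, so
   the sketch shifts the complement instead).  */
#if 0
static uint32_t
asr32_inline (uint32_t value, unsigned count)
{
  if (value & 0x80000000u)
    return ~(~value >> count);	/* Negative: shift ones in at the top.  */
  return value >> count;	/* Non-negative: plain logical shift.  */
}
#endif
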
12580 /* 32 bit arithmetic shift right. */
12581 static void
12582 asrv32 (sim_cpu *cpu)
12583 {
12584 unsigned rm = INSTR (20, 16);
12585 unsigned rn = INSTR (9, 5);
12586 unsigned rd = INSTR (4, 0);
12587
12588 aarch64_set_reg_u64
12589 (cpu, rd, NO_SP,
12590 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12591 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12592 }
12593
12594 /* 64 bit arithmetic shift right. */
12595 static void
12596 asrv64 (sim_cpu *cpu)
12597 {
12598 unsigned rm = INSTR (20, 16);
12599 unsigned rn = INSTR (9, 5);
12600 unsigned rd = INSTR (4, 0);
12601
12602 aarch64_set_reg_u64
12603 (cpu, rd, NO_SP,
12604 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12605 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12606 }
12607
12608 /* 32 bit logical shift left. */
12609 static void
12610 lslv32 (sim_cpu *cpu)
12611 {
12612 unsigned rm = INSTR (20, 16);
12613 unsigned rn = INSTR (9, 5);
12614 unsigned rd = INSTR (4, 0);
12615
12616 aarch64_set_reg_u64
12617 (cpu, rd, NO_SP,
12618 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12619 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12620 }
12621
/* 64 bit logical shift left.  */
12623 static void
12624 lslv64 (sim_cpu *cpu)
12625 {
12626 unsigned rm = INSTR (20, 16);
12627 unsigned rn = INSTR (9, 5);
12628 unsigned rd = INSTR (4, 0);
12629
12630 aarch64_set_reg_u64
12631 (cpu, rd, NO_SP,
12632 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12633 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12634 }
12635
12636 /* 32 bit logical shift right. */
12637 static void
12638 lsrv32 (sim_cpu *cpu)
12639 {
12640 unsigned rm = INSTR (20, 16);
12641 unsigned rn = INSTR (9, 5);
12642 unsigned rd = INSTR (4, 0);
12643
12644 aarch64_set_reg_u64
12645 (cpu, rd, NO_SP,
12646 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12647 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12648 }
12649
12650 /* 64 bit logical shift right. */
12651 static void
12652 lsrv64 (sim_cpu *cpu)
12653 {
12654 unsigned rm = INSTR (20, 16);
12655 unsigned rn = INSTR (9, 5);
12656 unsigned rd = INSTR (4, 0);
12657
12658 aarch64_set_reg_u64
12659 (cpu, rd, NO_SP,
12660 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12661 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12662 }
12663
12664 /* 32 bit rotate right. */
12665 static void
12666 rorv32 (sim_cpu *cpu)
12667 {
12668 unsigned rm = INSTR (20, 16);
12669 unsigned rn = INSTR (9, 5);
12670 unsigned rd = INSTR (4, 0);
12671
12672 aarch64_set_reg_u64
12673 (cpu, rd, NO_SP,
12674 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12675 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12676 }
12677
12678 /* 64 bit rotate right. */
12679 static void
12680 rorv64 (sim_cpu *cpu)
12681 {
12682 unsigned rm = INSTR (20, 16);
12683 unsigned rn = INSTR (9, 5);
12684 unsigned rd = INSTR (4, 0);
12685
12686 aarch64_set_reg_u64
12687 (cpu, rd, NO_SP,
12688 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12689 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12690 }
12691
12692
/* Divide.  */
12694
12695 /* 32 bit signed divide. */
12696 static void
sdiv32 (sim_cpu *cpu)
12698 {
12699 unsigned rm = INSTR (20, 16);
12700 unsigned rn = INSTR (9, 5);
12701 unsigned rd = INSTR (4, 0);
12702 /* N.B. the pseudo-code does the divide using 64 bit data. */
  /* N.B. C division truncates towards zero, as required.  */
12704 int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12705 int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12706
12707 aarch64_set_reg_s64 (cpu, rd, NO_SP,
12708 divisor ? ((int32_t) (dividend / divisor)) : 0);
12709 }
12710
12711 /* 64 bit signed divide. */
12712 static void
sdiv64 (sim_cpu *cpu)
12714 {
12715 unsigned rm = INSTR (20, 16);
12716 unsigned rn = INSTR (9, 5);
12717 unsigned rd = INSTR (4, 0);
12718
  int64_t dividend = aarch64_get_reg_s64 (cpu, rn, NO_SP);
  int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);

  /* C division truncates towards zero, as required, but INT64_MIN / -1
     overflows (undefined behaviour in C); the architecture defines that
     quotient to wrap back to INT64_MIN.  */
  if (dividend == INT64_MIN && divisor == -1)
    aarch64_set_reg_s64 (cpu, rd, NO_SP, INT64_MIN);
  else
    aarch64_set_reg_s64
      (cpu, rd, NO_SP, divisor ? (dividend / divisor) : 0);
12725 }
12726
12727 /* 32 bit unsigned divide. */
12728 static void
12729 udiv32 (sim_cpu *cpu)
12730 {
12731 unsigned rm = INSTR (20, 16);
12732 unsigned rn = INSTR (9, 5);
12733 unsigned rd = INSTR (4, 0);
12734
12735 /* N.B. the pseudo-code does the divide using 64 bit data. */
12736 uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12737 uint64_t divisor = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12738
12739 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12740 divisor ? (uint32_t) (dividend / divisor) : 0);
12741 }
12742
12743 /* 64 bit unsigned divide. */
12744 static void
12745 udiv64 (sim_cpu *cpu)
12746 {
12747 unsigned rm = INSTR (20, 16);
12748 unsigned rn = INSTR (9, 5);
12749 unsigned rd = INSTR (4, 0);
12750
  /* N.B. unsigned C division truncates towards zero, as required.  */
12752 uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12753
12754 aarch64_set_reg_u64
12755 (cpu, rd, NO_SP,
12756 divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12757 }
12758
12759 static void
12760 dexDataProc2Source (sim_cpu *cpu)
12761 {
12762 /* assert instr[30] == 0
12763 instr[28,21] == 11010110
12764 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12765 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
     instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
12767 001000 ==> LSLV, 001001 ==> LSRV
12768 001010 ==> ASRV, 001011 ==> RORV
12769 ow ==> UNALLOC. */
12770
12771 uint32_t dispatch;
12772 uint32_t S = INSTR (29, 29);
12773 uint32_t opcode = INSTR (15, 10);
12774
12775 if (S == 1)
12776 HALT_UNALLOC;
12777
12778 if (opcode & 0x34)
12779 HALT_UNALLOC;
12780
12781 dispatch = ( (INSTR (31, 31) << 3)
12782 | (uimm (opcode, 3, 3) << 2)
12783 | uimm (opcode, 1, 0));
12784 switch (dispatch)
12785 {
12786 case 2: udiv32 (cpu); return;
    case 3: sdiv32 (cpu); return;
12788 case 4: lslv32 (cpu); return;
12789 case 5: lsrv32 (cpu); return;
12790 case 6: asrv32 (cpu); return;
12791 case 7: rorv32 (cpu); return;
12792 case 10: udiv64 (cpu); return;
    case 11: sdiv64 (cpu); return;
12794 case 12: lslv64 (cpu); return;
12795 case 13: lsrv64 (cpu); return;
12796 case 14: asrv64 (cpu); return;
12797 case 15: rorv64 (cpu); return;
12798 default: HALT_UNALLOC;
12799 }
12800 }
12801
12802
12803 /* Multiply. */
12804
12805 /* 32 bit multiply and add. */
12806 static void
12807 madd32 (sim_cpu *cpu)
12808 {
12809 unsigned rm = INSTR (20, 16);
12810 unsigned ra = INSTR (14, 10);
12811 unsigned rn = INSTR (9, 5);
12812 unsigned rd = INSTR (4, 0);
12813
12814 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12815 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12816 aarch64_get_reg_u32 (cpu, ra, NO_SP)
12817 + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12818 * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12819 }
12820
12821 /* 64 bit multiply and add. */
12822 static void
12823 madd64 (sim_cpu *cpu)
12824 {
12825 unsigned rm = INSTR (20, 16);
12826 unsigned ra = INSTR (14, 10);
12827 unsigned rn = INSTR (9, 5);
12828 unsigned rd = INSTR (4, 0);
12829
12830 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12831 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12832 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12833 + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12834 * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12835 }
12836
12837 /* 32 bit multiply and sub. */
12838 static void
12839 msub32 (sim_cpu *cpu)
12840 {
12841 unsigned rm = INSTR (20, 16);
12842 unsigned ra = INSTR (14, 10);
12843 unsigned rn = INSTR (9, 5);
12844 unsigned rd = INSTR (4, 0);
12845
12846 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12847 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12848 aarch64_get_reg_u32 (cpu, ra, NO_SP)
12849 - aarch64_get_reg_u32 (cpu, rn, NO_SP)
12850 * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12851 }
12852
12853 /* 64 bit multiply and sub. */
12854 static void
12855 msub64 (sim_cpu *cpu)
12856 {
12857 unsigned rm = INSTR (20, 16);
12858 unsigned ra = INSTR (14, 10);
12859 unsigned rn = INSTR (9, 5);
12860 unsigned rd = INSTR (4, 0);
12861
12862 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12863 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12864 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12865 - aarch64_get_reg_u64 (cpu, rn, NO_SP)
12866 * aarch64_get_reg_u64 (cpu, rm, NO_SP));
12867 }
12868
12869 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit. */
12870 static void
12871 smaddl (sim_cpu *cpu)
12872 {
12873 unsigned rm = INSTR (20, 16);
12874 unsigned ra = INSTR (14, 10);
12875 unsigned rn = INSTR (9, 5);
12876 unsigned rd = INSTR (4, 0);
12877
12878 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12879 obtain a 64 bit product. */
12880 aarch64_set_reg_s64
12881 (cpu, rd, NO_SP,
12882 aarch64_get_reg_s64 (cpu, ra, NO_SP)
12883 + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12884 * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12885 }
12886
12887 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
12888 static void
12889 smsubl (sim_cpu *cpu)
12890 {
12891 unsigned rm = INSTR (20, 16);
12892 unsigned ra = INSTR (14, 10);
12893 unsigned rn = INSTR (9, 5);
12894 unsigned rd = INSTR (4, 0);
12895
12896 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12897 obtain a 64 bit product. */
12898 aarch64_set_reg_s64
12899 (cpu, rd, NO_SP,
12900 aarch64_get_reg_s64 (cpu, ra, NO_SP)
12901 - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12902 * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12903 }
12904
12905 /* Integer Multiply/Divide. */
12906
12907 /* First some macros and a helper function. */
12908 /* Macros to test or access elements of 64 bit words. */
12909
12910 /* Mask used to access lo 32 bits of 64 bit unsigned int. */
12911 #define LOW_WORD_MASK ((1ULL << 32) - 1)
12912 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int. */
12913 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
12914 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int. */
12915 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
12916
/* Offset of sign bit in 64 bit signed integer.  */
12918 #define SIGN_SHIFT_U64 63
12919 /* The sign bit itself -- also identifies the minimum negative int value. */
12920 #define SIGN_BIT_U64 (1UL << SIGN_SHIFT_U64)
12921 /* Return true if a 64 bit signed int presented as an unsigned int is the
12922 most negative value. */
12923 #define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
/* Return true (non-zero) if a 64 bit signed int presented as an unsigned
   int has its sign bit set.  */
12926 #define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
12927 /* Return 1L or -1L according to whether a 64 bit signed int presented as
12928 an unsigned int has its sign bit set or not. */
#define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
12930 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int. */
12931 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
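
/* Example: for the 64 bit value 0x1122334455667788ULL, lowWordToU64
   yields 0x55667788 and highWordToU64 yields 0x11223344.  */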
12932
/* Multiply two 64 bit ints and return
   the hi 64 bits of the 128 bit product.  */
12935
12936 static uint64_t
12937 mul64hi (uint64_t value1, uint64_t value2)
12938 {
12939 uint64_t resultmid1;
12940 uint64_t result;
12941 uint64_t value1_lo = lowWordToU64 (value1);
  uint64_t value1_hi = highWordToU64 (value1);
12943 uint64_t value2_lo = lowWordToU64 (value2);
12944 uint64_t value2_hi = highWordToU64 (value2);
12945
12946 /* Cross-multiply and collect results. */
12947 uint64_t xproductlo = value1_lo * value2_lo;
12948 uint64_t xproductmid1 = value1_lo * value2_hi;
12949 uint64_t xproductmid2 = value1_hi * value2_lo;
12950 uint64_t xproducthi = value1_hi * value2_hi;
12951 uint64_t carry = 0;
12952 /* Start accumulating 64 bit results. */
12953 /* Drop bottom half of lowest cross-product. */
12954 uint64_t resultmid = xproductlo >> 32;
12955 /* Add in middle products. */
12956 resultmid = resultmid + xproductmid1;
12957
12958 /* Check for overflow. */
12959 if (resultmid < xproductmid1)
12960 /* Carry over 1 into top cross-product. */
12961 carry++;
12962
12963 resultmid1 = resultmid + xproductmid2;
12964
12965 /* Check for overflow. */
12966 if (resultmid1 < xproductmid2)
12967 /* Carry over 1 into top cross-product. */
12968 carry++;
12969
12970 /* Drop lowest 32 bits of middle cross-product. */
12971 result = resultmid1 >> 32;
12972
  /* Add the top cross-product and any carry.  */
12974 result += xproducthi + carry;
12975
12976 return result;
12977 }
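
/* On GCC/Clang hosts that provide unsigned __int128 (an assumption, not
   ISO C) the function above can be cross-checked with a one-liner, kept
   under #if 0 so it plays no part in the build.  */
#if 0
static uint64_t
mul64hi_ref (uint64_t value1, uint64_t value2)
{
  return (uint64_t) (((unsigned __int128) value1 * value2) >> 64);
}
#endif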
12978
12979 /* Signed multiply high, source, source2 :
12980 64 bit, dest <-- high 64-bit of result. */
12981 static void
12982 smulh (sim_cpu *cpu)
12983 {
12984 uint64_t uresult;
12985 int64_t result;
12986 unsigned rm = INSTR (20, 16);
12987 unsigned rn = INSTR (9, 5);
12988 unsigned rd = INSTR (4, 0);
12989 GReg ra = INSTR (14, 10);
12990 int64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12991 int64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12992 uint64_t uvalue1;
12993 uint64_t uvalue2;
12994 int64_t signum = 1;
12995
12996 if (ra != R31)
12997 HALT_UNALLOC;
12998
  /* Convert to unsigned and use the unsigned mul64hi routine,
     then fix the sign up afterwards.  */
13001 if (value1 < 0)
13002 {
13003 signum *= -1L;
13004 uvalue1 = -value1;
13005 }
13006 else
13007 {
13008 uvalue1 = value1;
13009 }
13010
13011 if (value2 < 0)
13012 {
13013 signum *= -1L;
13014 uvalue2 = -value2;
13015 }
13016 else
13017 {
13018 uvalue2 = value2;
13019 }
13020
13021 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  uresult = mul64hi (uvalue1, uvalue2);
  result = uresult;
  if (signum < 0)
    /* Negate the 128 bit product: invert the high half, adding one when
       the low half is zero.  E.g. smulh (-1, 1) must yield -1, not
       -mul64hi (1, 1) == 0.  */
    result = ~result + ((uvalue1 * uvalue2) == 0);
13025
13026 aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
13027 }
13028
13029 /* Unsigned multiply add long -- source, source2 :
13030 32 bit, source3 : 64 bit. */
13031 static void
13032 umaddl (sim_cpu *cpu)
13033 {
13034 unsigned rm = INSTR (20, 16);
13035 unsigned ra = INSTR (14, 10);
13036 unsigned rn = INSTR (9, 5);
13037 unsigned rd = INSTR (4, 0);
13038
13039 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
     obtain a 64 bit product.  */
13042 aarch64_set_reg_u64
13043 (cpu, rd, NO_SP,
13044 aarch64_get_reg_u64 (cpu, ra, NO_SP)
13045 + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13046 * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13047 }
13048
13049 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
13050 static void
13051 umsubl (sim_cpu *cpu)
13052 {
13053 unsigned rm = INSTR (20, 16);
13054 unsigned ra = INSTR (14, 10);
13055 unsigned rn = INSTR (9, 5);
13056 unsigned rd = INSTR (4, 0);
13057
13058 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
     obtain a 64 bit product.  */
13061 aarch64_set_reg_u64
13062 (cpu, rd, NO_SP,
13063 aarch64_get_reg_u64 (cpu, ra, NO_SP)
13064 - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13065 * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13066 }
13067
13068 /* Unsigned multiply high, source, source2 :
13069 64 bit, dest <-- high 64-bit of result. */
13070 static void
13071 umulh (sim_cpu *cpu)
13072 {
13073 unsigned rm = INSTR (20, 16);
13074 unsigned rn = INSTR (9, 5);
13075 unsigned rd = INSTR (4, 0);
13076 GReg ra = INSTR (14, 10);
13077
13078 if (ra != R31)
13079 HALT_UNALLOC;
13080
13081 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13082 aarch64_set_reg_u64 (cpu, rd, NO_SP,
13083 mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13084 aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13085 }
13086
13087 static void
13088 dexDataProc3Source (sim_cpu *cpu)
13089 {
13090 /* assert instr[28,24] == 11011. */
13091 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13092 instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
     instr[23,21] = op31 : 111 ==> UNALLOC, ow ==> ok
13094 instr[15] = o0 : 0/1 ==> ok
13095 instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB, (32/64 bit)
13096 0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13097 0100 ==> SMULH, (64 bit only)
                            1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13099 1100 ==> UMULH (64 bit only)
13100 ow ==> UNALLOC. */
13101
13102 uint32_t dispatch;
13103 uint32_t size = INSTR (31, 31);
13104 uint32_t op54 = INSTR (30, 29);
13105 uint32_t op31 = INSTR (23, 21);
13106 uint32_t o0 = INSTR (15, 15);
13107
13108 if (op54 != 0)
13109 HALT_UNALLOC;
13110
13111 if (size == 0)
13112 {
13113 if (op31 != 0)
13114 HALT_UNALLOC;
13115
13116 if (o0 == 0)
13117 madd32 (cpu);
13118 else
13119 msub32 (cpu);
13120 return;
13121 }
13122
13123 dispatch = (op31 << 1) | o0;
13124
13125 switch (dispatch)
13126 {
13127 case 0: madd64 (cpu); return;
13128 case 1: msub64 (cpu); return;
13129 case 2: smaddl (cpu); return;
13130 case 3: smsubl (cpu); return;
13131 case 4: smulh (cpu); return;
13132 case 10: umaddl (cpu); return;
13133 case 11: umsubl (cpu); return;
13134 case 12: umulh (cpu); return;
13135 default: HALT_UNALLOC;
13136 }
13137 }
13138
13139 static void
13140 dexDPReg (sim_cpu *cpu)
13141 {
13142 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13143 assert group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13144 bits [28:24:21] of a DPReg are the secondary dispatch vector. */
13145 uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13146
13147 switch (group2)
13148 {
13149 case DPREG_LOG_000:
13150 case DPREG_LOG_001:
13151 dexLogicalShiftedRegister (cpu); return;
13152
13153 case DPREG_ADDSHF_010:
13154 dexAddSubtractShiftedRegister (cpu); return;
13155
13156 case DPREG_ADDEXT_011:
13157 dexAddSubtractExtendedRegister (cpu); return;
13158
13159 case DPREG_ADDCOND_100:
13160 {
13161 /* This set bundles a variety of different operations. */
	/* Check for one of:  */
13163 /* 1) add/sub w carry. */
13164 uint32_t mask1 = 0x1FE00000U;
13165 uint32_t val1 = 0x1A000000U;
13166 /* 2) cond compare register/immediate. */
13167 uint32_t mask2 = 0x1FE00000U;
13168 uint32_t val2 = 0x1A400000U;
13169 /* 3) cond select. */
13170 uint32_t mask3 = 0x1FE00000U;
13171 uint32_t val3 = 0x1A800000U;
13172 /* 4) data proc 1/2 source. */
13173 uint32_t mask4 = 0x1FE00000U;
13174 uint32_t val4 = 0x1AC00000U;
13175
13176 if ((aarch64_get_instr (cpu) & mask1) == val1)
13177 dexAddSubtractWithCarry (cpu);
13178
13179 else if ((aarch64_get_instr (cpu) & mask2) == val2)
13180 CondCompare (cpu);
13181
13182 else if ((aarch64_get_instr (cpu) & mask3) == val3)
13183 dexCondSelect (cpu);
13184
13185 else if ((aarch64_get_instr (cpu) & mask4) == val4)
13186 {
13187 /* Bit 30 is clear for data proc 2 source
13188 and set for data proc 1 source. */
13189 if (aarch64_get_instr (cpu) & (1U << 30))
13190 dexDataProc1Source (cpu);
13191 else
13192 dexDataProc2Source (cpu);
13193 }
13194
13195 else
13196 /* Should not reach here. */
13197 HALT_NYI;
13198
13199 return;
13200 }
13201
13202 case DPREG_3SRC_110:
13203 dexDataProc3Source (cpu); return;
13204
13205 case DPREG_UNALLOC_101:
13206 HALT_UNALLOC;
13207
13208 case DPREG_3SRC_111:
13209 dexDataProc3Source (cpu); return;
13210
13211 default:
13212 /* Should never reach here. */
13213 HALT_NYI;
13214 }
13215 }
13216
13217 /* Unconditional Branch immediate.
13218 Offset is a PC-relative byte offset in the range +/- 128MiB.
   The offset is assumed to be raw from the decode, i.e. the
   simulator is expected to scale it from a word offset to a byte offset.  */
13221
13222 /* Unconditional branch. */
13223 static void
13224 buc (sim_cpu *cpu, int32_t offset)
13225 {
13226 aarch64_set_next_PC_by_offset (cpu, offset);
13227 }
13228
13229 static unsigned stack_depth = 0;
13230
13231 /* Unconditional branch and link -- writes return PC to LR. */
13232 static void
13233 bl (sim_cpu *cpu, int32_t offset)
13234 {
13235 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13236 aarch64_save_LR (cpu);
13237 aarch64_set_next_PC_by_offset (cpu, offset);
13238
13239 if (TRACE_BRANCH_P (cpu))
13240 {
13241 ++ stack_depth;
13242 TRACE_BRANCH (cpu,
13243 " %*scall %" PRIx64 " [%s]"
13244 " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13245 stack_depth, " ", aarch64_get_next_PC (cpu),
13246 aarch64_get_func (CPU_STATE (cpu),
13247 aarch64_get_next_PC (cpu)),
13248 aarch64_get_reg_u64 (cpu, 0, NO_SP),
13249 aarch64_get_reg_u64 (cpu, 1, NO_SP),
13250 aarch64_get_reg_u64 (cpu, 2, NO_SP)
13251 );
13252 }
13253 }
13254
13255 /* Unconditional Branch register.
13256 Branch/return address is in source register. */
13257
13258 /* Unconditional branch. */
13259 static void
13260 br (sim_cpu *cpu)
13261 {
13262 unsigned rn = INSTR (9, 5);
13263 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13264 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13265 }
13266
13267 /* Unconditional branch and link -- writes return PC to LR. */
13268 static void
13269 blr (sim_cpu *cpu)
13270 {
13271 unsigned rn = INSTR (9, 5);
13272
13273 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The pseudo code in the spec says we update LR before fetching
     the value from rn.  */
13276 aarch64_save_LR (cpu);
13277 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13278
13279 if (TRACE_BRANCH_P (cpu))
13280 {
13281 ++ stack_depth;
13282 TRACE_BRANCH (cpu,
13283 " %*scall %" PRIx64 " [%s]"
13284 " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13285 stack_depth, " ", aarch64_get_next_PC (cpu),
13286 aarch64_get_func (CPU_STATE (cpu),
13287 aarch64_get_next_PC (cpu)),
13288 aarch64_get_reg_u64 (cpu, 0, NO_SP),
13289 aarch64_get_reg_u64 (cpu, 1, NO_SP),
13290 aarch64_get_reg_u64 (cpu, 2, NO_SP)
13291 );
13292 }
13293 }
13294
/* Return -- the assembler will default the source to LR.  This is
   functionally equivalent to br but, presumably, unlike br it side
   effects the branch predictor.  */
13298 static void
13299 ret (sim_cpu *cpu)
13300 {
13301 unsigned rn = INSTR (9, 5);
13302 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13303
13304 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13305 if (TRACE_BRANCH_P (cpu))
13306 {
13307 TRACE_BRANCH (cpu,
13308 " %*sreturn [result: %" PRIx64 "]",
13309 stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13310 -- stack_depth;
13311 }
13312 }
13313
13314 /* NOP -- we implement this and call it from the decode in case we
13315 want to intercept it later. */
13316
13317 static void
13318 nop (sim_cpu *cpu)
13319 {
13320 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13321 }
13322
13323 /* Data synchronization barrier. */
13324
13325 static void
13326 dsb (sim_cpu *cpu)
13327 {
13328 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13329 }
13330
13331 /* Data memory barrier. */
13332
13333 static void
13334 dmb (sim_cpu *cpu)
13335 {
13336 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13337 }
13338
13339 /* Instruction synchronization barrier. */
13340
13341 static void
13342 isb (sim_cpu *cpu)
13343 {
13344 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13345 }
13346
13347 static void
13348 dexBranchImmediate (sim_cpu *cpu)
13349 {
13350 /* assert instr[30,26] == 00101
13351 instr[31] ==> 0 == B, 1 == BL
13352 instr[25,0] == imm26 branch offset counted in words. */
13353
13354 uint32_t top = INSTR (31, 31);
  /* We have a 26 bit signed word offset which we need to pass to the
     execute routine as a signed byte offset.  */
13357 int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13358
13359 if (top)
13360 bl (cpu, offset);
13361 else
13362 buc (cpu, offset);
13363 }
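
/* Example: an imm26 field of 0x3ffffff sign-extends to -1 word and
   scales to a byte offset of -4 -- a branch to the previous
   instruction.  The maximum reach is +/- 2^25 words, i.e. +/- 128MiB.  */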
13364
13365 /* Control Flow. */
13366
13367 /* Conditional branch
13368
   Offset is a PC-relative byte offset in the range +/- 1MiB.  Pos is
   a bit position in the range 0 .. 63.
13371
13372 cc is a CondCode enum value as pulled out of the decode
13373
13374 N.B. any offset register (source) can only be Xn or Wn. */
13375
13376 static void
13377 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13378 {
13379 /* The test returns TRUE if CC is met. */
13380 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13381 if (testConditionCode (cpu, cc))
13382 aarch64_set_next_PC_by_offset (cpu, offset);
13383 }
13384
13385 /* 32 bit branch on register non-zero. */
13386 static void
13387 cbnz32 (sim_cpu *cpu, int32_t offset)
13388 {
13389 unsigned rt = INSTR (4, 0);
13390
13391 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13392 if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13393 aarch64_set_next_PC_by_offset (cpu, offset);
13394 }
13395
/* 64 bit branch on register non-zero.  */
13397 static void
13398 cbnz (sim_cpu *cpu, int32_t offset)
13399 {
13400 unsigned rt = INSTR (4, 0);
13401
13402 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13403 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13404 aarch64_set_next_PC_by_offset (cpu, offset);
13405 }
13406
/* 32 bit branch on register zero.  */
13408 static void
13409 cbz32 (sim_cpu *cpu, int32_t offset)
13410 {
13411 unsigned rt = INSTR (4, 0);
13412
13413 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13414 if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13415 aarch64_set_next_PC_by_offset (cpu, offset);
13416 }
13417
13418 /* 64 bit branch on register zero. */
13419 static void
13420 cbz (sim_cpu *cpu, int32_t offset)
13421 {
13422 unsigned rt = INSTR (4, 0);
13423
13424 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13425 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13426 aarch64_set_next_PC_by_offset (cpu, offset);
13427 }
13428
13429 /* Branch on register bit test non-zero -- one size fits all. */
13430 static void
13431 tbnz (sim_cpu *cpu, uint32_t pos, int32_t offset)
13432 {
13433 unsigned rt = INSTR (4, 0);
13434
13435 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13436 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13437 aarch64_set_next_PC_by_offset (cpu, offset);
13438 }
13439
13440 /* Branch on register bit test zero -- one size fits all. */
13441 static void
13442 tbz (sim_cpu *cpu, uint32_t pos, int32_t offset)
13443 {
13444 unsigned rt = INSTR (4, 0);
13445
13446 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13447 if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13448 aarch64_set_next_PC_by_offset (cpu, offset);
13449 }
13450
13451 static void
13452 dexCompareBranchImmediate (sim_cpu *cpu)
13453 {
13454 /* instr[30,25] = 01 1010
13455 instr[31] = size : 0 ==> 32, 1 ==> 64
13456 instr[24] = op : 0 ==> CBZ, 1 ==> CBNZ
13457 instr[23,5] = simm19 branch offset counted in words
13458 instr[4,0] = rt */
13459
13460 uint32_t size = INSTR (31, 31);
13461 uint32_t op = INSTR (24, 24);
13462 int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13463
13464 if (size == 0)
13465 {
13466 if (op == 0)
13467 cbz32 (cpu, offset);
13468 else
13469 cbnz32 (cpu, offset);
13470 }
13471 else
13472 {
13473 if (op == 0)
13474 cbz (cpu, offset);
13475 else
13476 cbnz (cpu, offset);
13477 }
13478 }
13479
13480 static void
13481 dexTestBranchImmediate (sim_cpu *cpu)
13482 {
13483 /* instr[31] = b5 : bit 5 of test bit idx
13484 instr[30,25] = 01 1011
13485 instr[24] = op : 0 ==> TBZ, 1 == TBNZ
13486 instr[23,19] = b40 : bits 4 to 0 of test bit idx
13487 instr[18,5] = simm14 : signed offset counted in words
13488 instr[4,0] = uimm5 */
13489
13490 uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13491 int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13492
13493 NYI_assert (30, 25, 0x1b);
13494
13495 if (INSTR (24, 24) == 0)
13496 tbz (cpu, pos, offset);
13497 else
13498 tbnz (cpu, pos, offset);
13499 }
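
/* Example: a test of bit 40 encodes the index as b5:b40 = 1:01000 (b5
   is set because 40 >= 32), so pos above reassembles to 40.  */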
13500
13501 static void
13502 dexCondBranchImmediate (sim_cpu *cpu)
13503 {
13504 /* instr[31,25] = 010 1010
     instr[24] = op1 : 0 ==> B.cond, 1 ==> UNALLOC
     instr[23,5] = simm19 : signed offset counted in words
     instr[4] = op0 : 0 ==> B.cond, 1 ==> UNALLOC
13508 instr[3,0] = cond */
13509
13510 int32_t offset;
13511 uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13512
13513 NYI_assert (31, 25, 0x2a);
13514
13515 if (op != 0)
13516 HALT_UNALLOC;
13517
13518 offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13519
13520 bcc (cpu, offset, INSTR (3, 0));
13521 }
13522
13523 static void
13524 dexBranchRegister (sim_cpu *cpu)
13525 {
13526 /* instr[31,25] = 110 1011
13527 instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 3 => ERET, 4 => DRPS
13528 instr[20,16] = op2 : must be 11111
13529 instr[15,10] = op3 : must be 000000
     instr[4,0]   = op4 : must be 00000.  */
13531
13532 uint32_t op = INSTR (24, 21);
13533 uint32_t op2 = INSTR (20, 16);
13534 uint32_t op3 = INSTR (15, 10);
13535 uint32_t op4 = INSTR (4, 0);
13536
13537 NYI_assert (31, 25, 0x6b);
13538
13539 if (op2 != 0x1F || op3 != 0 || op4 != 0)
13540 HALT_UNALLOC;
13541
13542 if (op == 0)
13543 br (cpu);
13544
13545 else if (op == 1)
13546 blr (cpu);
13547
13548 else if (op == 2)
13549 ret (cpu);
13550
13551 else
13552 {
      /* ERET and DRPS accept 0b11111 for rn = instr [4,0];
	 anything else is unallocated.  */
13555 uint32_t rn = INSTR (4, 0);
13556
13557 if (rn != 0x1f)
13558 HALT_UNALLOC;
13559
13560 if (op == 4 || op == 5)
13561 HALT_NYI;
13562
13563 HALT_UNALLOC;
13564 }
13565 }
13566
13567 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13568 but this may not be available. So instead we define the values we need
13569 here. */
13570 #define AngelSVC_Reason_Open 0x01
13571 #define AngelSVC_Reason_Close 0x02
13572 #define AngelSVC_Reason_Write 0x05
13573 #define AngelSVC_Reason_Read 0x06
13574 #define AngelSVC_Reason_IsTTY 0x09
13575 #define AngelSVC_Reason_Seek 0x0A
13576 #define AngelSVC_Reason_FLen 0x0C
13577 #define AngelSVC_Reason_Remove 0x0E
13578 #define AngelSVC_Reason_Rename 0x0F
13579 #define AngelSVC_Reason_Clock 0x10
13580 #define AngelSVC_Reason_Time 0x11
13581 #define AngelSVC_Reason_System 0x12
13582 #define AngelSVC_Reason_Errno 0x13
13583 #define AngelSVC_Reason_GetCmdLine 0x15
13584 #define AngelSVC_Reason_HeapInfo 0x16
13585 #define AngelSVC_Reason_ReportException 0x18
13586 #define AngelSVC_Reason_Elapsed 0x30
13587
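/* A guest reaches these services via the A64 semihosting trap: the
   reason code goes in W0, a parameter-block pointer in X1, and the
   program executes HLT #0xf000.  An illustrative fragment (the label
   "param_block" is made up for the example):

	mov	w0, #0x18		// AngelSVC_Reason_ReportException
	adr	x1, param_block		// {type, state} pair
	hlt	#0xf000			// semihosting trap

   Any result comes back in X0.  */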
13588
13589 static void
13590 handle_halt (sim_cpu *cpu, uint32_t val)
13591 {
13592 uint64_t result = 0;
13593
13594 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13595 if (val != 0xf000)
13596 {
13597 TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13598 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13599 sim_stopped, SIM_SIGTRAP);
13600 }
13601
13602 /* We have encountered an Angel SVC call. See if we can process it. */
13603 switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13604 {
13605 case AngelSVC_Reason_HeapInfo:
13606 {
13607 /* Get the values. */
13608 uint64_t stack_top = aarch64_get_stack_start (cpu);
13609 uint64_t heap_base = aarch64_get_heap_start (cpu);
13610
13611 /* Get the pointer */
13612 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13613 ptr = aarch64_get_mem_u64 (cpu, ptr);
13614
13615 /* Fill in the memory block. */
13616 /* Start addr of heap. */
13617 aarch64_set_mem_u64 (cpu, ptr + 0, heap_base);
13618 /* End addr of heap. */
13619 aarch64_set_mem_u64 (cpu, ptr + 8, stack_top);
13620 /* Lowest stack addr. */
13621 aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13622 /* Initial stack addr. */
13623 aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13624
13625 TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13626 }
13627 break;
13628
13629 case AngelSVC_Reason_Open:
13630 {
13631 /* Get the pointer */
13632 /* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);. */
13633 /* FIXME: For now we just assume that we will only be asked
13634 to open the standard file descriptors. */
13635 static int fd = 0;
13636 result = fd ++;
13637
13638 TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13639 }
13640 break;
13641
13642 case AngelSVC_Reason_Close:
13643 {
13644 uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13645 TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13646 result = 0;
13647 }
13648 break;
13649
13650 case AngelSVC_Reason_Errno:
13651 result = 0;
13652 TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13653 break;
13654
13655 case AngelSVC_Reason_Clock:
13656 result =
13657 #ifdef CLOCKS_PER_SEC
13658 (CLOCKS_PER_SEC >= 100)
13659 ? (clock () / (CLOCKS_PER_SEC / 100))
13660 : ((clock () * 100) / CLOCKS_PER_SEC)
13661 #else
13662 /* Presume unix... clock() returns microseconds. */
13663 (clock () / 10000)
13664 #endif
13665 ;
13666 TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13667 break;
13668
13669 case AngelSVC_Reason_GetCmdLine:
13670 {
13671 /* Get the pointer */
13672 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13673 ptr = aarch64_get_mem_u64 (cpu, ptr);
13674
13675 /* FIXME: No command line for now. */
13676 aarch64_set_mem_u64 (cpu, ptr, 0);
13677 TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13678 }
13679 break;
13680
13681 case AngelSVC_Reason_IsTTY:
13682 result = 1;
13683 TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13684 break;
13685
13686 case AngelSVC_Reason_Write:
13687 {
13688 /* Get the pointer */
13689 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13690 /* Get the write control block. */
13691 uint64_t fd = aarch64_get_mem_u64 (cpu, ptr);
13692 uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13693 uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13694
13695 TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13696 PRIx64 " on descriptor %" PRIx64,
13697 len, buf, fd);
13698
13699 if (len > 1280)
13700 {
13701 TRACE_SYSCALL (cpu,
13702 " AngelSVC: Write: Suspiciously long write: %ld",
13703 (long) len);
13704 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13705 sim_stopped, SIM_SIGBUS);
13706 }
13707 else if (fd == 1)
13708 {
13709 printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13710 }
13711 else if (fd == 2)
13712 {
13713 TRACE (cpu, 0, "\n");
13714 sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13715 (int) len, aarch64_get_mem_ptr (cpu, buf));
13716 TRACE (cpu, 0, "\n");
13717 }
13718 else
13719 {
13720 TRACE_SYSCALL (cpu,
13721 " AngelSVC: Write: Unexpected file handle: %d",
13722 (int) fd);
13723 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13724 sim_stopped, SIM_SIGABRT);
13725 }
13726 }
13727 break;
13728
13729 case AngelSVC_Reason_ReportException:
13730 {
13731 /* Get the pointer */
13732 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13733 /*ptr = aarch64_get_mem_u64 (cpu, ptr);. */
13734 uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13735 uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13736
13737 TRACE_SYSCALL (cpu,
13738 "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13739 type, state);
13740
13741 if (type == 0x20026)
13742 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13743 sim_exited, state);
13744 else
13745 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13746 sim_stopped, SIM_SIGINT);
13747 }
13748 break;
13749
13750 case AngelSVC_Reason_Read:
13751 case AngelSVC_Reason_FLen:
13752 case AngelSVC_Reason_Seek:
13753 case AngelSVC_Reason_Remove:
13754 case AngelSVC_Reason_Time:
13755 case AngelSVC_Reason_System:
13756 case AngelSVC_Reason_Rename:
13757 case AngelSVC_Reason_Elapsed:
13758 default:
13759 TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13760 aarch64_get_reg_u32 (cpu, 0, NO_SP));
13761 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13762 sim_stopped, SIM_SIGTRAP);
13763 }
13764
13765 aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13766 }
13767
13768 static void
13769 dexExcpnGen (sim_cpu *cpu)
13770 {
13771 /* instr[31:24] = 11010100
13772 instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13773 010 ==> HLT, 101 ==> DBG GEN EXCPN
13774 instr[20,5] = imm16
13775 instr[4,2] = opc2 000 ==> OK, ow ==> UNALLOC
13776 instr[1,0] = LL : discriminates opc */
13777
13778 uint32_t opc = INSTR (23, 21);
13779 uint32_t imm16 = INSTR (20, 5);
13780 uint32_t opc2 = INSTR (4, 2);
13781 uint32_t LL;
13782
13783 NYI_assert (31, 24, 0xd4);
13784
13785 if (opc2 != 0)
13786 HALT_UNALLOC;
13787
13788 LL = INSTR (1, 0);
13789
13790 /* We only implement HLT and BRK for now. */
13791 if (opc == 1 && LL == 0)
13792 {
13793 TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13794 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13795 sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13796 }
13797
13798 if (opc == 2 && LL == 0)
13799 handle_halt (cpu, imm16);
13800
13801 else if (opc == 0 || opc == 5)
13802 HALT_NYI;
13803
13804 else
13805 HALT_UNALLOC;
13806 }
13807
13808 /* Stub for accessing system registers. */
13809
13810 static uint64_t
13811 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13812 unsigned crm, unsigned op2)
13813 {
13814 if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13815 /* DCZID_EL0 - the Data Cache Zero ID register.
13816 We do not support DC ZVA at the moment, so
13817 we return a value with the disable bit set.
13818 We implement support for the DCZID register since
13819 it is used by the C library's memset function. */
13820 return ((uint64_t) 1) << 4;
13821
13822 if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13823 /* Cache Type Register. */
13824 return 0x80008000UL;
13825
13826 if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13827 /* TPIDR_EL0 - thread pointer id. */
13828 return aarch64_get_thread_id (cpu);
13829
13830 if (op1 == 3 && crm == 4 && op2 == 0)
13831 return aarch64_get_FPCR (cpu);
13832
13833 if (op1 == 3 && crm == 4 && op2 == 1)
13834 return aarch64_get_FPSR (cpu);
13835
13836 else if (op1 == 3 && crm == 2 && op2 == 0)
13837 return aarch64_get_CPSR (cpu);
13838
13839 HALT_NYI;
13840 }
13841
13842 static void
13843 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13844 unsigned crm, unsigned op2, uint64_t val)
13845 {
13846 if (op1 == 3 && crm == 4 && op2 == 0)
13847 aarch64_set_FPCR (cpu, val);
13848
13849 else if (op1 == 3 && crm == 4 && op2 == 1)
13850 aarch64_set_FPSR (cpu, val);
13851
13852 else if (op1 == 3 && crm == 2 && op2 == 0)
13853 aarch64_set_CPSR (cpu, val);
13854
13855 else
13856 HALT_NYI;
13857 }
13858
13859 static void
13860 do_mrs (sim_cpu *cpu)
13861 {
  /* instr[31:20] = 1101 0101 0011
13863 instr[19] = op0
13864 instr[18,16] = op1
13865 instr[15,12] = CRn
13866 instr[11,8] = CRm
13867 instr[7,5] = op2
13868 instr[4,0] = Rt */
13869 unsigned sys_op0 = INSTR (19, 19) + 2;
13870 unsigned sys_op1 = INSTR (18, 16);
13871 unsigned sys_crn = INSTR (15, 12);
13872 unsigned sys_crm = INSTR (11, 8);
13873 unsigned sys_op2 = INSTR (7, 5);
13874 unsigned rt = INSTR (4, 0);
13875
13876 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13877 aarch64_set_reg_u64 (cpu, rt, NO_SP,
13878 system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
13879 }
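
/* Example: "mrs x0, tpidr_el0" encodes op0 = 3, op1 = 3, CRn = 13,
   CRm = 0, op2 = 2, so the lookup above returns the simulated thread
   pointer from system_get.  */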
13880
13881 static void
13882 do_MSR_immediate (sim_cpu *cpu)
13883 {
13884 /* instr[31:19] = 1101 0101 0000 0
13885 instr[18,16] = op1
13886 instr[15,12] = 0100
13887 instr[11,8] = CRm
13888 instr[7,5] = op2
13889 instr[4,0] = 1 1111 */
13890
13891 unsigned op1 = INSTR (18, 16);
13892 /*unsigned crm = INSTR (11, 8);*/
13893 unsigned op2 = INSTR (7, 5);
13894
13895 NYI_assert (31, 19, 0x1AA0);
13896 NYI_assert (15, 12, 0x4);
13897 NYI_assert (4, 0, 0x1F);
13898
13899 if (op1 == 0)
13900 {
13901 if (op2 == 5)
13902 HALT_NYI; /* set SPSel. */
13903 else
13904 HALT_UNALLOC;
13905 }
13906 else if (op1 == 3)
13907 {
13908 if (op2 == 6)
13909 HALT_NYI; /* set DAIFset. */
13910 else if (op2 == 7)
13911 HALT_NYI; /* set DAIFclr. */
13912 else
13913 HALT_UNALLOC;
13914 }
13915 else
13916 HALT_UNALLOC;
13917 }
13918
13919 static void
13920 do_MSR_reg (sim_cpu *cpu)
13921 {
13922 /* instr[31:20] = 1101 0101 0001
13923 instr[19] = op0
13924 instr[18,16] = op1
13925 instr[15,12] = CRn
13926 instr[11,8] = CRm
13927 instr[7,5] = op2
13928 instr[4,0] = Rt */
13929
13930 unsigned sys_op0 = INSTR (19, 19) + 2;
13931 unsigned sys_op1 = INSTR (18, 16);
13932 unsigned sys_crn = INSTR (15, 12);
13933 unsigned sys_crm = INSTR (11, 8);
13934 unsigned sys_op2 = INSTR (7, 5);
13935 unsigned rt = INSTR (4, 0);
13936
13937 NYI_assert (31, 20, 0xD51);
13938
13939 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13940 system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
13941 aarch64_get_reg_u64 (cpu, rt, NO_SP));
13942 }
13943
13944 static void
13945 do_SYS (sim_cpu *cpu)
13946 {
13947 /* instr[31,19] = 1101 0101 0000 1
13948 instr[18,16] = op1
13949 instr[15,12] = CRn
13950 instr[11,8] = CRm
13951 instr[7,5] = op2
13952 instr[4,0] = Rt */
13953 NYI_assert (31, 19, 0x1AA1);
13954
13955 /* FIXME: For now we just silently accept system ops. */
13956 }
13957
13958 static void
13959 dexSystem (sim_cpu *cpu)
13960 {
13961 /* instr[31:22] = 1101 01010 0
13962 instr[21] = L
13963 instr[20,19] = op0
13964 instr[18,16] = op1
13965 instr[15,12] = CRn
13966 instr[11,8] = CRm
13967 instr[7,5] = op2
13968 instr[4,0] = uimm5 */
13969
13970 /* We are interested in HINT, DSB, DMB and ISB
13971
     Hint #0 encodes NOOP (this is the only hint we care about);
     L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111, and either
     CRm:op2 == 0000 000 (a true NOP) or an unallocated hint
     (CRm != 0000, or CRm == 0000 and op2 > 101), which we also ignore.
13975
     DSB, DMB, ISB are data synchronization barrier, data memory
     barrier and instruction synchronization barrier, respectively, where
13978
13979 L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
13980 op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
13981 CRm<3:2> ==> domain, CRm<1:0> ==> types,
13982 domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
              10 ==> InnerShareable, 11 ==> FullSystem
13984 types : 01 ==> Reads, 10 ==> Writes,
13985 11 ==> All, 00 ==> All (domain == FullSystem). */
13986
13987 unsigned rt = INSTR (4, 0);
13988
13989 NYI_assert (31, 22, 0x354);
13990
13991 switch (INSTR (21, 12))
13992 {
13993 case 0x032:
13994 if (rt == 0x1F)
13995 {
	  /* NOP has CRm != 0000, OR
	     (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
13998 uint32_t crm = INSTR (11, 8);
13999 uint32_t op2 = INSTR (7, 5);
14000
14001 if (crm != 0 || (op2 == 0 || op2 > 5))
14002 {
14003 /* Actually call nop method so we can reimplement it later. */
14004 nop (cpu);
14005 return;
14006 }
14007 }
14008 HALT_NYI;
14009
14010 case 0x033:
14011 {
14012 uint32_t op2 = INSTR (7, 5);
14013
14014 switch (op2)
14015 {
14016 case 2: HALT_NYI;
14017 case 4: dsb (cpu); return;
14018 case 5: dmb (cpu); return;
14019 case 6: isb (cpu); return;
14020 default: HALT_UNALLOC;
14021 }
14022 }
14023
14024 case 0x3B0:
14025 case 0x3B4:
14026 case 0x3BD:
14027 do_mrs (cpu);
14028 return;
14029
14030 case 0x0B7:
14031 do_SYS (cpu); /* DC is an alias of SYS. */
14032 return;
14033
14034 default:
14035 if (INSTR (21, 20) == 0x1)
14036 do_MSR_reg (cpu);
14037 else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
14038 do_MSR_immediate (cpu);
14039 else
14040 HALT_NYI;
14041 return;
14042 }
14043 }
14044
14045 static void
14046 dexBr (sim_cpu *cpu)
14047 {
14048 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
14049 assert group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
14050 bits [31,29] of a BrExSys are the secondary dispatch vector. */
14051 uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
14052
14053 switch (group2)
14054 {
14055 case BR_IMM_000:
14056 return dexBranchImmediate (cpu);
14057
14058 case BR_IMMCMP_001:
14059 /* Compare has bit 25 clear while test has it set. */
14060 if (!INSTR (25, 25))
14061 dexCompareBranchImmediate (cpu);
14062 else
14063 dexTestBranchImmediate (cpu);
14064 return;
14065
14066 case BR_IMMCOND_010:
14067 /* This is a conditional branch if bit 25 is clear otherwise
14068 unallocated. */
14069 if (!INSTR (25, 25))
14070 dexCondBranchImmediate (cpu);
14071 else
14072 HALT_UNALLOC;
14073 return;
14074
14075 case BR_UNALLOC_011:
14076 HALT_UNALLOC;
14077
14078 case BR_IMM_100:
14079 dexBranchImmediate (cpu);
14080 return;
14081
14082 case BR_IMMCMP_101:
14083 /* Compare has bit 25 clear while test has it set. */
14084 if (!INSTR (25, 25))
14085 dexCompareBranchImmediate (cpu);
14086 else
14087 dexTestBranchImmediate (cpu);
14088 return;
14089
14090 case BR_REG_110:
14091 /* Unconditional branch reg has bit 25 set. */
14092 if (INSTR (25, 25))
14093 dexBranchRegister (cpu);
14094
14095 /* This includes both Excpn Gen, System and unalloc operations.
14096 We need to decode the Excpn Gen operation BRK so we can plant
14097 debugger entry points.
14098 Excpn Gen operations have instr [24] = 0.
14099 we need to decode at least one of the System operations NOP
14100 which is an alias for HINT #0.
14101 System operations have instr [24,22] = 100. */
14102 else if (INSTR (24, 24) == 0)
14103 dexExcpnGen (cpu);
14104
14105 else if (INSTR (24, 22) == 4)
14106 dexSystem (cpu);
14107
14108 else
14109 HALT_UNALLOC;
14110
14111 return;
14112
14113 case BR_UNALLOC_111:
14114 HALT_UNALLOC;
14115
14116 default:
14117 /* Should never reach here. */
14118 HALT_NYI;
14119 }
14120 }
14121
14122 static void
14123 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14124 {
  /* We need to check whether gdb wants control here.  */
14126 /* checkBreak (cpu);. */
14127
14128 uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14129
14130 switch (group)
14131 {
14132 case GROUP_PSEUDO_0000: dexPseudo (cpu); break;
14133 case GROUP_LDST_0100: dexLdSt (cpu); break;
14134 case GROUP_DPREG_0101: dexDPReg (cpu); break;
14135 case GROUP_LDST_0110: dexLdSt (cpu); break;
14136 case GROUP_ADVSIMD_0111: dexAdvSIMD0 (cpu); break;
14137 case GROUP_DPIMM_1000: dexDPImm (cpu); break;
14138 case GROUP_DPIMM_1001: dexDPImm (cpu); break;
14139 case GROUP_BREXSYS_1010: dexBr (cpu); break;
14140 case GROUP_BREXSYS_1011: dexBr (cpu); break;
14141 case GROUP_LDST_1100: dexLdSt (cpu); break;
14142 case GROUP_DPREG_1101: dexDPReg (cpu); break;
14143 case GROUP_LDST_1110: dexLdSt (cpu); break;
14144 case GROUP_ADVSIMD_1111: dexAdvSIMD1 (cpu); break;
14145
14146 case GROUP_UNALLOC_0001:
14147 case GROUP_UNALLOC_0010:
14148 case GROUP_UNALLOC_0011:
14149 HALT_UNALLOC;
14150
14151 default:
14152 /* Should never reach here. */
14153 HALT_NYI;
14154 }
14155 }
14156
14157 static bfd_boolean
14158 aarch64_step (sim_cpu *cpu)
14159 {
14160 uint64_t pc = aarch64_get_PC (cpu);
14161
14162 if (pc == TOP_LEVEL_RETURN_PC)
14163 return FALSE;
14164
14165 aarch64_set_next_PC (cpu, pc + 4);
14166
14167 /* Code is always little-endian. */
14168 sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14169 & aarch64_get_instr (cpu), pc, 4);
14170 aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14171
14172 TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14173 aarch64_get_instr (cpu));
14174 TRACE_DISASM (cpu, pc);
14175
14176 aarch64_decode_and_execute (cpu, pc);
14177
14178 return TRUE;
14179 }
14180
14181 void
14182 aarch64_run (SIM_DESC sd)
14183 {
14184 sim_cpu *cpu = STATE_CPU (sd, 0);
14185
14186 while (aarch64_step (cpu))
14187 {
14188 aarch64_update_PC (cpu);
14189
14190 if (sim_events_tick (sd))
14191 sim_events_process (sd);
14192 }
14193
14194 sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14195 sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14196 }
14197
14198 void
14199 aarch64_init (sim_cpu *cpu, uint64_t pc)
14200 {
14201 uint64_t sp = aarch64_get_stack_start (cpu);
14202
14203 /* Install SP, FP and PC and set LR to -20
14204 so we can detect a top-level return. */
14205 aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14206 aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14207 aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14208 aarch64_set_next_PC (cpu, pc);
14209 aarch64_update_PC (cpu);
14210 aarch64_init_LIT_table ();
14211 }