x86: further refine SSE check (SSE4a, SHA, GFNI)

[deliverable/binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index 2d20f1cae9470f57c5529aa6016373396c79f4be..accb6342038a5de8ea520fd19451fa96dce5bf77 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1,5 +1,5 @@
  /* tc-i386.c -- Assemble code for the Intel 80386
-   Copyright (C) 1989-2018 Free Software Foundation, Inc.
+   Copyright (C) 1989-2019 Free Software Foundation, Inc.
  
     This file is part of GAS, the GNU Assembler.
  
@@ -33,6 +33,17 @@
  #include "elf/x86-64.h"
  #include "opcodes/i386-init.h"
  
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#else
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#ifndef INT_MAX
+#define INT_MAX (int) (((unsigned) (-1)) >> 1)
+#endif
+#endif
+
  #ifndef REGISTER_WARNINGS
  #define REGISTER_WARNINGS 1
  #endif
@@ -87,6 +98,9 @@
  
  #define END_OF_INSN '\0'
  
+/* This matches the C -> StaticRounding alias in the opcode table.  */
+#define commutative staticrounding
+
  /*
    'templates' is for grouping together 'template' structures for opcodes
    of the same name.  This is only used for storing the insns in the grand
@@ -188,6 +202,13 @@ static void s_bss (int);
  #endif
  #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
  static void handle_large_common (int small ATTRIBUTE_UNUSED);
+
+/* GNU_PROPERTY_X86_ISA_1_USED.  */
+static unsigned int x86_isa_1_used;
+/* GNU_PROPERTY_X86_FEATURE_2_USED.  */
+static unsigned int x86_feature_2_used;
+/* Generate x86 used ISA and feature properties.  */
+static unsigned int x86_used_note = DEFAULT_X86_USED_NOTE;
  #endif
  
  static const char *default_arch = DEFAULT_ARCH;
@@ -230,6 +251,9 @@ struct Broadcast_Operation
  
    /* Index of broadcasted operand.  */
    int operand;
+
+  /* Number of bytes to broadcast.  */
+  int bytes;
  };
  
  static struct Broadcast_Operation broadcast_op;
@@ -269,7 +293,6 @@ enum i386_error
      invalid_vector_register_set,
      unsupported_vector_index_register,
      unsupported_broadcast,
-    broadcast_not_on_src_operand,
      broadcast_needed,
      unsupported_masking,
      mask_not_on_destination,
@@ -307,6 +330,7 @@ struct _i386_insn
      /* Flags for operands.  */
      unsigned int flags[MAX_OPERANDS];
  #define Operand_PCrel 1
+#define Operand_Mem   2
  
      /* Relocation type for operand */
      enum bfd_reloc_code_real reloc[MAX_OPERANDS];
@@ -329,6 +353,21 @@ struct _i386_insn
      unsigned int prefixes;
      unsigned char prefix[MAX_PREFIXES];
  
+    /* The operand to a branch insn indicates an absolute branch.  */
+    bfd_boolean jumpabsolute;
+
+    /* Has MMX register operands.  */
+    bfd_boolean has_regmmx;
+
+    /* Has XMM register operands.  */
+    bfd_boolean has_regxmm;
+
+    /* Has YMM register operands.  */
+    bfd_boolean has_regymm;
+
+    /* Has ZMM register operands.  */
+    bfd_boolean has_regzmm;
+
      /* RM and SIB are the modrm byte and the sib byte where the
         addressing modes of this insn are encoded.  */
      modrm_byte rm;
@@ -354,7 +393,8 @@ struct _i386_insn
        {
         dir_encoding_default = 0,
         dir_encoding_load,
-       dir_encoding_store
+       dir_encoding_store,
+       dir_encoding_swap
        } dir_encoding;
  
      /* Prefer 8bit or 32bit displacement in encoding.  */
@@ -661,6 +701,13 @@ static enum
      vex256
    } avxscalar;
  
+/* Encode VEX WIG instructions with specific vex.w.  */
+static enum
+  {
+    vexw0 = 0,
+    vexw1
+  } vexwig;
+
  /* Encode scalar EVEX LIG instructions with specific vector length.  */
  static enum
    {
@@ -857,6 +904,10 @@ static const arch_entry cpu_arch[] =
      CPU_387_FLAGS, 0 },
    { STRING_COMMA_LEN (".687"), PROCESSOR_UNKNOWN,
      CPU_687_FLAGS, 0 },
+  { STRING_COMMA_LEN (".cmov"), PROCESSOR_UNKNOWN,
+    CPU_CMOV_FLAGS, 0 },
+  { STRING_COMMA_LEN (".fxsr"), PROCESSOR_UNKNOWN,
+    CPU_FXSR_FLAGS, 0 },
    { STRING_COMMA_LEN (".mmx"), PROCESSOR_UNKNOWN,
      CPU_MMX_FLAGS, 0 },
    { STRING_COMMA_LEN (".sse"), PROCESSOR_UNKNOWN,
@@ -1035,6 +1086,16 @@ static const arch_entry cpu_arch[] =
      CPU_MOVDIRI_FLAGS, 0 },
    { STRING_COMMA_LEN (".movdir64b"), PROCESSOR_UNKNOWN,
      CPU_MOVDIR64B_FLAGS, 0 },
+  { STRING_COMMA_LEN (".avx512_bf16"), PROCESSOR_UNKNOWN,
+    CPU_AVX512_BF16_FLAGS, 0 },
+  { STRING_COMMA_LEN (".avx512_vp2intersect"), PROCESSOR_UNKNOWN,
+    CPU_AVX512_VP2INTERSECT_FLAGS, 0 },
+  { STRING_COMMA_LEN (".enqcmd"), PROCESSOR_UNKNOWN,
+    CPU_ENQCMD_FLAGS, 0 },
+  { STRING_COMMA_LEN (".rdpru"), PROCESSOR_UNKNOWN,
+    CPU_RDPRU_FLAGS, 0 },
+  { STRING_COMMA_LEN (".mcommit"), PROCESSOR_UNKNOWN,
+    CPU_MCOMMIT_FLAGS, 0 },
  };
  
  static const noarch_entry cpu_noarch[] =
@@ -1043,6 +1104,8 @@ static const noarch_entry cpu_noarch[] =
    { STRING_COMMA_LEN ("no287"),  CPU_ANY_287_FLAGS },
    { STRING_COMMA_LEN ("no387"),  CPU_ANY_387_FLAGS },
    { STRING_COMMA_LEN ("no687"),  CPU_ANY_687_FLAGS },
+  { STRING_COMMA_LEN ("nocmov"),  CPU_ANY_CMOV_FLAGS },
+  { STRING_COMMA_LEN ("nofxsr"),  CPU_ANY_FXSR_FLAGS },
    { STRING_COMMA_LEN ("nommx"),  CPU_ANY_MMX_FLAGS },
    { STRING_COMMA_LEN ("nosse"),  CPU_ANY_SSE_FLAGS },
    { STRING_COMMA_LEN ("nosse2"),  CPU_ANY_SSE2_FLAGS },
@@ -1072,6 +1135,9 @@ static const noarch_entry cpu_noarch[] =
    { STRING_COMMA_LEN ("noshstk"), CPU_ANY_SHSTK_FLAGS },
    { STRING_COMMA_LEN ("nomovdiri"), CPU_ANY_MOVDIRI_FLAGS },
    { STRING_COMMA_LEN ("nomovdir64b"), CPU_ANY_MOVDIR64B_FLAGS },
+  { STRING_COMMA_LEN ("noavx512_bf16"), CPU_ANY_AVX512_BF16_FLAGS },
+  { STRING_COMMA_LEN ("noavx512_vp2intersect"), CPU_ANY_AVX512_VP2INTERSECT_FLAGS },
+  { STRING_COMMA_LEN ("noenqcmd"), CPU_ANY_ENQCMD_FLAGS },
  };
  
  #ifdef I386COFF
@@ -1249,7 +1315,16 @@ i386_output_nops (char *where, const unsigned char *const *patt,
    /* Place the longer NOP first.  */
    int last;
    int offset;
-  const unsigned char *nops =  patt[max_single_nop_size - 1];
+  const unsigned char *nops;
+
+  if (max_single_nop_size < 1)
+    {
+      as_fatal (_("i386_output_nops called to generate nops of at most %d bytes!"),
+               max_single_nop_size);
+      return;
+    }
+
+  nops = patt[max_single_nop_size - 1];
  
    /* Use the smaller one if the requsted one isn't available.  */
    if (nops == NULL)
@@ -1539,6 +1614,9 @@ operand_type_set (union i386_operand_type *x, unsigned int v)
      default:
        abort ();
      }
+
+  x->bitfield.class = ClassNone;
+  x->bitfield.instance = InstanceNone;
  }
  
  static INLINE int
@@ -1753,6 +1831,11 @@ cpu_flags_match (const insn_template *t)
  static INLINE i386_operand_type
  operand_type_and (i386_operand_type x, i386_operand_type y)
  {
+  if (x.bitfield.class != y.bitfield.class)
+    x.bitfield.class = ClassNone;
+  if (x.bitfield.instance != y.bitfield.instance)
+    x.bitfield.instance = InstanceNone;
+
    switch (ARRAY_SIZE (x.array))
      {
      case 3:
@@ -1773,6 +1856,9 @@ operand_type_and (i386_operand_type x, i386_operand_type y)
  static INLINE i386_operand_type
  operand_type_and_not (i386_operand_type x, i386_operand_type y)
  {
+  gas_assert (y.bitfield.class == ClassNone);
+  gas_assert (y.bitfield.instance == InstanceNone);
+
    switch (ARRAY_SIZE (x.array))
      {
      case 3:
@@ -1793,6 +1879,13 @@ operand_type_and_not (i386_operand_type x, i386_operand_type y)
  static INLINE i386_operand_type
  operand_type_or (i386_operand_type x, i386_operand_type y)
  {
+  gas_assert (x.bitfield.class == ClassNone ||
+              y.bitfield.class == ClassNone ||
+              x.bitfield.class == y.bitfield.class);
+  gas_assert (x.bitfield.instance == InstanceNone ||
+              y.bitfield.instance == InstanceNone ||
+              x.bitfield.instance == y.bitfield.instance);
+
    switch (ARRAY_SIZE (x.array))
      {
      case 3:
@@ -1813,6 +1906,9 @@ operand_type_or (i386_operand_type x, i386_operand_type y)
  static INLINE i386_operand_type
  operand_type_xor (i386_operand_type x, i386_operand_type y)
  {
+  gas_assert (y.bitfield.class == ClassNone);
+  gas_assert (y.bitfield.instance == InstanceNone);
+
    switch (ARRAY_SIZE (x.array))
      {
      case 3:
@@ -1830,18 +1926,12 @@ operand_type_xor (i386_operand_type x, i386_operand_type y)
    return x;
  }
  
-static const i386_operand_type acc32 = OPERAND_TYPE_ACC32;
-static const i386_operand_type acc64 = OPERAND_TYPE_ACC64;
-static const i386_operand_type inoutportreg
-  = OPERAND_TYPE_INOUTPORTREG;
-static const i386_operand_type reg16_inoutportreg
-  = OPERAND_TYPE_REG16_INOUTPORTREG;
  static const i386_operand_type disp16 = OPERAND_TYPE_DISP16;
  static const i386_operand_type disp32 = OPERAND_TYPE_DISP32;
  static const i386_operand_type disp32s = OPERAND_TYPE_DISP32S;
  static const i386_operand_type disp16_32 = OPERAND_TYPE_DISP16_32;
-static const i386_operand_type anydisp
-  = OPERAND_TYPE_ANYDISP;
+static const i386_operand_type anydisp = OPERAND_TYPE_ANYDISP;
+static const i386_operand_type anyimm = OPERAND_TYPE_ANYIMM;
  static const i386_operand_type regxmm = OPERAND_TYPE_REGXMM;
  static const i386_operand_type regmask = OPERAND_TYPE_REGMASK;
  static const i386_operand_type imm8 = OPERAND_TYPE_IMM8;
@@ -1853,7 +1943,6 @@ static const i386_operand_type imm64 = OPERAND_TYPE_IMM64;
  static const i386_operand_type imm16_32 = OPERAND_TYPE_IMM16_32;
  static const i386_operand_type imm16_32s = OPERAND_TYPE_IMM16_32S;
  static const i386_operand_type imm16_32_32s = OPERAND_TYPE_IMM16_32_32S;
-static const i386_operand_type vec_imm4 = OPERAND_TYPE_VEC_IMM4;
  
  enum operand_type
  {
@@ -1869,7 +1958,7 @@ operand_type_check (i386_operand_type t, enum operand_type c)
    switch (c)
      {
      case reg:
-      return t.bitfield.reg;
+      return t.bitfield.class == Reg;
  
      case imm:
        return (t.bitfield.imm8
@@ -1901,143 +1990,156 @@ operand_type_check (i386_operand_type t, enum operand_type c)
    return 0;
  }
  
-/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit size
+   between operand GIVEN and operand WANTED for instruction template T.  */
  
  static INLINE int
-match_reg_size (const insn_template *t, unsigned int j)
+match_operand_size (const insn_template *t, unsigned int wanted,
+                   unsigned int given)
  {
-  return !((i.types[j].bitfield.byte
-           && !t->operand_types[j].bitfield.byte)
-          || (i.types[j].bitfield.word
-              && !t->operand_types[j].bitfield.word)
-          || (i.types[j].bitfield.dword
-              && !t->operand_types[j].bitfield.dword)
-          || (i.types[j].bitfield.qword
-              && !t->operand_types[j].bitfield.qword)
-          || (i.types[j].bitfield.tbyte
-              && !t->operand_types[j].bitfield.tbyte));
+  return !((i.types[given].bitfield.byte
+           && !t->operand_types[wanted].bitfield.byte)
+          || (i.types[given].bitfield.word
+              && !t->operand_types[wanted].bitfield.word)
+          || (i.types[given].bitfield.dword
+              && !t->operand_types[wanted].bitfield.dword)
+          || (i.types[given].bitfield.qword
+              && !t->operand_types[wanted].bitfield.qword)
+          || (i.types[given].bitfield.tbyte
+              && !t->operand_types[wanted].bitfield.tbyte));
  }
  
-/* Return 1 if there is no conflict in SIMD register on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in SIMD register between operand
+   GIVEN and operand WANTED for instruction template T.  */
  
  static INLINE int
-match_simd_size (const insn_template *t, unsigned int j)
+match_simd_size (const insn_template *t, unsigned int wanted,
+                unsigned int given)
  {
-  return !((i.types[j].bitfield.xmmword
-           && !t->operand_types[j].bitfield.xmmword)
-          || (i.types[j].bitfield.ymmword
-              && !t->operand_types[j].bitfield.ymmword)
-          || (i.types[j].bitfield.zmmword
-              && !t->operand_types[j].bitfield.zmmword));
+  return !((i.types[given].bitfield.xmmword
+           && !t->operand_types[wanted].bitfield.xmmword)
+          || (i.types[given].bitfield.ymmword
+              && !t->operand_types[wanted].bitfield.ymmword)
+          || (i.types[given].bitfield.zmmword
+              && !t->operand_types[wanted].bitfield.zmmword));
  }
  
-/* Return 1 if there is no conflict in any size on operand J for
-   instruction template T.  */
+/* Return 1 if there is no conflict in any size between operand GIVEN
+   and operand WANTED for instruction template T.  */
  
  static INLINE int
-match_mem_size (const insn_template *t, unsigned int j)
+match_mem_size (const insn_template *t, unsigned int wanted,
+               unsigned int given)
  {
-  return (match_reg_size (t, j)
-         && !((i.types[j].bitfield.unspecified
+  return (match_operand_size (t, wanted, given)
+         && !((i.types[given].bitfield.unspecified
                 && !i.broadcast
-               && !t->operand_types[j].bitfield.unspecified)
-              || (i.types[j].bitfield.fword
-                  && !t->operand_types[j].bitfield.fword)
+               && !t->operand_types[wanted].bitfield.unspecified)
+              || (i.types[given].bitfield.fword
+                  && !t->operand_types[wanted].bitfield.fword)
                /* For scalar opcode templates to allow register and memory
                   operands at the same time, some special casing is needed
                   here.  Also for v{,p}broadcast*, {,v}pmov{s,z}*, and
                   down-conversion vpmov*.  */
-              || ((t->operand_types[j].bitfield.regsimd
+              || ((t->operand_types[wanted].bitfield.class == RegSIMD
                     && !t->opcode_modifier.broadcast
-                   && (t->operand_types[j].bitfield.byte
-                       || t->operand_types[j].bitfield.word
-                       || t->operand_types[j].bitfield.dword
-                       || t->operand_types[j].bitfield.qword))
-                  ? (i.types[j].bitfield.xmmword
-                     || i.types[j].bitfield.ymmword
-                     || i.types[j].bitfield.zmmword)
-                  : !match_simd_size(t, j))));
+                   && (t->operand_types[wanted].bitfield.byte
+                       || t->operand_types[wanted].bitfield.word
+                       || t->operand_types[wanted].bitfield.dword
+                       || t->operand_types[wanted].bitfield.qword))
+                  ? (i.types[given].bitfield.xmmword
+                     || i.types[given].bitfield.ymmword
+                     || i.types[given].bitfield.zmmword)
+                  : !match_simd_size(t, wanted, given))));
  }
  
-/* Return 1 if there is no size conflict on any operands for
-   instruction template T.  */
+/* Return value has MATCH_STRAIGHT set if there is no size conflict on any
+   operands for instruction template T, and it has MATCH_REVERSE set if there
+   is no size conflict on any operands for the template with operands reversed
+   (and the template allows for reversing in the first place).  */
  
-static INLINE int
+#define MATCH_STRAIGHT 1
+#define MATCH_REVERSE  2
+
+static INLINE unsigned int
  operand_size_match (const insn_template *t)
  {
-  unsigned int j;
-  int match = 1;
+  unsigned int j, match = MATCH_STRAIGHT;
  
-  /* Don't check jump instructions.  */
+  /* Don't check non-absolute jump instructions.  */
    if (t->opcode_modifier.jump
-      || t->opcode_modifier.jumpbyte
-      || t->opcode_modifier.jumpdword
-      || t->opcode_modifier.jumpintersegment)
+      && t->opcode_modifier.jump != JUMP_ABSOLUTE)
      return match;
  
    /* Check memory and accumulator operand size.  */
    for (j = 0; j < i.operands; j++)
      {
-      if (!i.types[j].bitfield.reg && !i.types[j].bitfield.regsimd
-         && t->operand_types[j].bitfield.anysize)
+      if (i.types[j].bitfield.class != Reg
+         && i.types[j].bitfield.class != RegSIMD
+         && t->opcode_modifier.anysize)
         continue;
  
-      if (t->operand_types[j].bitfield.reg
-         && !match_reg_size (t, j))
+      if (t->operand_types[j].bitfield.class == Reg
+         && !match_operand_size (t, j, j))
         {
           match = 0;
           break;
         }
  
-      if (t->operand_types[j].bitfield.regsimd
-         && !match_simd_size (t, j))
+      if (t->operand_types[j].bitfield.class == RegSIMD
+         && !match_simd_size (t, j, j))
         {
           match = 0;
           break;
         }
  
-      if (t->operand_types[j].bitfield.acc
-         && (!match_reg_size (t, j) || !match_simd_size (t, j)))
+      if (t->operand_types[j].bitfield.instance == Accum
+         && (!match_operand_size (t, j, j) || !match_simd_size (t, j, j)))
         {
           match = 0;
           break;
         }
  
-      if (i.types[j].bitfield.mem && !match_mem_size (t, j))
+      if ((i.flags[j] & Operand_Mem) && !match_mem_size (t, j, j))
         {
           match = 0;
           break;
         }
      }
  
-  if (match)
-    return match;
-  else if (!t->opcode_modifier.d)
+  if (!t->opcode_modifier.d)
      {
  mismatch:
-      i.error = operand_size_mismatch;
-      return 0;
+      if (!match)
+       i.error = operand_size_mismatch;
+      return match;
      }
  
    /* Check reverse.  */
-  gas_assert (i.operands == 2);
+  gas_assert (i.operands >= 2 && i.operands <= 3);
  
-  match = 1;
-  for (j = 0; j < 2; j++)
+  for (j = 0; j < i.operands; j++)
      {
-      if ((t->operand_types[j].bitfield.reg
-          || t->operand_types[j].bitfield.acc)
-         && !match_reg_size (t, j ? 0 : 1))
+      unsigned int given = i.operands - j - 1;
+
+      if (t->operand_types[j].bitfield.class == Reg
+         && !match_operand_size (t, j, given))
+       goto mismatch;
+
+      if (t->operand_types[j].bitfield.class == RegSIMD
+         && !match_simd_size (t, j, given))
+       goto mismatch;
+
+      if (t->operand_types[j].bitfield.instance == Accum
+         && (!match_operand_size (t, j, given)
+             || !match_simd_size (t, j, given)))
         goto mismatch;
  
-      if (i.types[j].bitfield.mem
-         && !match_mem_size (t, j ? 0 : 1))
+      if ((i.flags[given] & Operand_Mem) && !match_mem_size (t, j, given))
         goto mismatch;
      }
  
-  return match;
+  return match | MATCH_REVERSE;
  }
  
  static INLINE int
@@ -2046,7 +2148,6 @@ operand_type_match (i386_operand_type overlap,
  {
    i386_operand_type temp = overlap;
  
-  temp.bitfield.jumpabsolute = 0;
    temp.bitfield.unspecified = 0;
    temp.bitfield.byte = 0;
    temp.bitfield.word = 0;
@@ -2060,8 +2161,7 @@ operand_type_match (i386_operand_type overlap,
    if (operand_type_all_zero (&temp))
      goto mismatch;
  
-  if (given.bitfield.baseindex == overlap.bitfield.baseindex
-      && given.bitfield.jumpabsolute == overlap.bitfield.jumpabsolute)
+  if (given.bitfield.baseindex == overlap.bitfield.baseindex)
      return 1;
  
  mismatch:
@@ -2080,18 +2180,18 @@ operand_type_register_match (i386_operand_type g0,
                              i386_operand_type g1,
                              i386_operand_type t1)
  {
-  if (!g0.bitfield.reg
-      && !g0.bitfield.regsimd
+  if (g0.bitfield.class != Reg
+      && g0.bitfield.class != RegSIMD
        && (!operand_type_check (g0, anymem)
           || g0.bitfield.unspecified
-         || !t0.bitfield.regsimd))
+         || t0.bitfield.class != RegSIMD))
      return 1;
  
-  if (!g1.bitfield.reg
-      && !g1.bitfield.regsimd
+  if (g1.bitfield.class != Reg
+      && g1.bitfield.class != RegSIMD
        && (!operand_type_check (g1, anymem)
           || g1.bitfield.unspecified
-         || !t1.bitfield.regsimd))
+         || t1.bitfield.class != RegSIMD))
      return 1;
  
    if (g0.bitfield.byte == g1.bitfield.byte
@@ -2929,7 +3029,7 @@ static void pe (expressionS *);
  static void ps (symbolS *);
  
  static void
-pi (char *line, i386_insn *x)
+pi (const char *line, i386_insn *x)
  {
    unsigned int j;
  
@@ -2953,14 +3053,13 @@ pi (char *line, i386_insn *x)
        fprintf (stdout, "    #%d:  ", j + 1);
        pt (x->types[j]);
        fprintf (stdout, "\n");
-      if (x->types[j].bitfield.reg
-         || x->types[j].bitfield.regmmx
-         || x->types[j].bitfield.regsimd
-         || x->types[j].bitfield.sreg2
-         || x->types[j].bitfield.sreg3
-         || x->types[j].bitfield.control
-         || x->types[j].bitfield.debug
-         || x->types[j].bitfield.test)
+      if (x->types[j].bitfield.class == Reg
+         || x->types[j].bitfield.class == RegMMX
+         || x->types[j].bitfield.class == RegSIMD
+         || x->types[j].bitfield.class == SReg
+         || x->types[j].bitfield.class == RegCR
+         || x->types[j].bitfield.class == RegDR
+         || x->types[j].bitfield.class == RegTR)
         fprintf (stdout, "%s\n", x->op[j].regs->reg_name);
        if (operand_type_check (x->types[j], imm))
         pe (x->op[j].imms);
@@ -3030,6 +3129,10 @@ const type_names[] =
    { OPERAND_TYPE_REG16, "r16" },
    { OPERAND_TYPE_REG32, "r32" },
    { OPERAND_TYPE_REG64, "r64" },
+  { OPERAND_TYPE_ACC8, "acc8" },
+  { OPERAND_TYPE_ACC16, "acc16" },
+  { OPERAND_TYPE_ACC32, "acc32" },
+  { OPERAND_TYPE_ACC64, "acc64" },
    { OPERAND_TYPE_IMM8, "i8" },
    { OPERAND_TYPE_IMM8, "i8s" },
    { OPERAND_TYPE_IMM16, "i16" },
@@ -3050,16 +3153,12 @@ const type_names[] =
    { OPERAND_TYPE_DEBUG, "debug reg" },
    { OPERAND_TYPE_FLOATREG, "FReg" },
    { OPERAND_TYPE_FLOATACC, "FAcc" },
-  { OPERAND_TYPE_SREG2, "SReg2" },
-  { OPERAND_TYPE_SREG3, "SReg3" },
-  { OPERAND_TYPE_ACC, "Acc" },
-  { OPERAND_TYPE_JUMPABSOLUTE, "Jump Absolute" },
+  { OPERAND_TYPE_SREG, "SReg" },
    { OPERAND_TYPE_REGMMX, "rMMX" },
    { OPERAND_TYPE_REGXMM, "rXMM" },
    { OPERAND_TYPE_REGYMM, "rYMM" },
    { OPERAND_TYPE_REGZMM, "rZMM" },
    { OPERAND_TYPE_REGMASK, "Mask reg" },
-  { OPERAND_TYPE_ESSEG, "es" },
  };
  
  static void
@@ -3071,7 +3170,7 @@ pt (i386_operand_type t)
    for (j = 0; j < ARRAY_SIZE (type_names); j++)
      {
        a = operand_type_and (t, type_names[j].mask);
-      if (!operand_type_all_zero (&a))
+      if (operand_type_equal (&a, &type_names[j].mask))
         fprintf (stdout, "%s, ",  type_names[j].name);
      }
    fflush (stdout);
@@ -3314,6 +3413,7 @@ build_vex_prefix (const insn_template *t)
    unsigned int register_specifier;
    unsigned int implied_prefix;
    unsigned int vector_length;
+  unsigned int w;
  
    /* Check register specifier.  */
    if (i.vex.register_specifier)
@@ -3325,13 +3425,15 @@ build_vex_prefix (const insn_template *t)
    else
      register_specifier = 0xf;
  
-  /* Use 2-byte VEX prefix by swapping destination and source
-     operand.  */
-  if (i.vec_encoding != vex_encoding_vex3
+  /* Use 2-byte VEX prefix by swapping destination and source operand
+     if there is more than one register operand.  */
+  if (i.reg_operands > 1
+      && i.vec_encoding != vex_encoding_vex3
        && i.dir_encoding == dir_encoding_default
        && i.operands == i.reg_operands
+      && operand_type_equal (&i.types[0], &i.types[i.operands - 1])
        && i.tm.opcode_modifier.vexopcode == VEX0F
-      && i.tm.opcode_modifier.load
+      && (i.tm.opcode_modifier.load || i.tm.opcode_modifier.d)
        && i.rex == REX_B)
      {
        unsigned int xchg = i.operands - 1;
@@ -3352,8 +3454,48 @@ build_vex_prefix (const insn_template *t)
        i.rm.regmem = i.rm.reg;
        i.rm.reg = xchg;
  
-      /* Use the next insn.  */
-      i.tm = t[1];
+      if (i.tm.opcode_modifier.d)
+       i.tm.base_opcode ^= (i.tm.base_opcode & 0xee) != 0x6e
+                           ? Opcode_SIMD_FloatD : Opcode_SIMD_IntD;
+      else /* Use the next insn.  */
+       i.tm = t[1];
+    }
+
+  /* Use 2-byte VEX prefix by swapping commutative source operands if there
+     are no memory operands and at least 3 register ones.  */
+  if (i.reg_operands >= 3
+      && i.vec_encoding != vex_encoding_vex3
+      && i.reg_operands == i.operands - i.imm_operands
+      && i.tm.opcode_modifier.vex
+      && i.tm.opcode_modifier.commutative
+      && (i.tm.opcode_modifier.sse2avx || optimize > 1)
+      && i.rex == REX_B
+      && i.vex.register_specifier
+      && !(i.vex.register_specifier->reg_flags & RegRex))
+    {
+      unsigned int xchg = i.operands - i.reg_operands;
+      union i386_op temp_op;
+      i386_operand_type temp_type;
+
+      gas_assert (i.tm.opcode_modifier.vexopcode == VEX0F);
+      gas_assert (!i.tm.opcode_modifier.sae);
+      gas_assert (operand_type_equal (&i.types[i.operands - 2],
+                                      &i.types[i.operands - 3]));
+      gas_assert (i.rm.mode == 3);
+
+      temp_type = i.types[xchg];
+      i.types[xchg] = i.types[xchg + 1];
+      i.types[xchg + 1] = temp_type;
+      temp_op = i.op[xchg];
+      i.op[xchg] = i.op[xchg + 1];
+      i.op[xchg + 1] = temp_op;
+
+      i.rex = 0;
+      xchg = i.rm.regmem | 8;
+      i.rm.regmem = ~register_specifier & 0xf;
+      gas_assert (!(i.rm.regmem & 8));
+      i.vex.register_specifier += xchg - i.rm.regmem;
+      register_specifier = ~xchg & 0xf;
      }
  
    if (i.tm.opcode_modifier.vex == VEXScalar)
@@ -3364,8 +3506,10 @@ build_vex_prefix (const insn_template *t)
      {
        unsigned int op;
  
+      /* Determine vector length from the last multi-length vector
+        operand.  */
        vector_length = 0;
-      for (op = 0; op < t->operands; ++op)
+      for (op = t->operands; op--;)
         if (t->operand_types[op].bitfield.xmmword
             && t->operand_types[op].bitfield.ymmword
             && i.types[op].bitfield.ymmword)
@@ -3393,10 +3537,18 @@ build_vex_prefix (const insn_template *t)
        abort ();
      }
  
+  /* Check the REX.W bit and VEXW.  */
+  if (i.tm.opcode_modifier.vexw == VEXWIG)
+    w = (vexwig == vexw1 || (i.rex & REX_W)) ? 1 : 0;
+  else if (i.tm.opcode_modifier.vexw)
+    w = i.tm.opcode_modifier.vexw == VEXW1 ? 1 : 0;
+  else
+    w = (flag_code == CODE_64BIT ? i.rex & REX_W : vexwig == vexw1) ? 1 : 0;
+
    /* Use 2-byte VEX prefix if possible.  */
-  if (i.vec_encoding != vex_encoding_vex3
+  if (w == 0
+      && i.vec_encoding != vex_encoding_vex3
        && i.tm.opcode_modifier.vexopcode == VEX0F
-      && i.tm.opcode_modifier.vexw != VEXW1
        && (i.rex & (REX_W | REX_X | REX_B)) == 0)
      {
        /* 2-byte VEX prefix.  */
@@ -3415,7 +3567,7 @@ build_vex_prefix (const insn_template *t)
    else
      {
        /* 3-byte VEX prefix.  */
-      unsigned int m, w;
+      unsigned int m;
  
        i.vex.length = 3;
  
@@ -3453,11 +3605,6 @@ build_vex_prefix (const insn_template *t)
          of RXB bits from REX.  */
        i.vex.bytes[1] = (~i.rex & 0x7) << 5 | m;
  
-      /* Check the REX.W bit.  */
-      w = (i.rex & REX_W) ? 1 : 0;
-      if (i.tm.opcode_modifier.vexw == VEXW1)
-       w = 1;
-
        i.vex.bytes[2] = (w << 7
                         | register_specifier << 3
                         | vector_length << 2
@@ -3468,9 +3615,16 @@ build_vex_prefix (const insn_template *t)
  static INLINE bfd_boolean
  is_evex_encoding (const insn_template *t)
  {
-  return t->opcode_modifier.evex
+  return t->opcode_modifier.evex || t->opcode_modifier.disp8memshift
          || t->opcode_modifier.broadcast || t->opcode_modifier.masking
-        || t->opcode_modifier.staticrounding || t->opcode_modifier.sae;
+        || t->opcode_modifier.sae;
+}
+
+static INLINE bfd_boolean
+is_any_vex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.vex || t->opcode_modifier.vexopcode
+        || is_evex_encoding (t);
  }
  
  /* Build the EVEX prefix.  */
@@ -3576,19 +3730,13 @@ build_evex_prefix (void)
    i.vrex &= ~vrex_used;
    gas_assert (i.vrex == 0);
  
-  /* Check the REX.W bit.  */
-  w = (i.rex & REX_W) ? 1 : 0;
-  if (i.tm.opcode_modifier.vexw)
-    {
-      if (i.tm.opcode_modifier.vexw == VEXW1)
-       w = 1;
-    }
-  /* If w is not set it means we are dealing with WIG instruction.  */
-  else if (!w)
-    {
-      if (evexwig == evexw1)
-        w = 1;
-    }
+  /* Check the REX.W bit and VEXW.  */
+  if (i.tm.opcode_modifier.vexw == VEXWIG)
+    w = (evexwig == evexw1 || (i.rex & REX_W)) ? 1 : 0;
+  else if (i.tm.opcode_modifier.vexw)
+    w = i.tm.opcode_modifier.vexw == VEXW1 ? 1 : 0;
+  else
+    w = (flag_code == CODE_64BIT ? i.rex & REX_W : evexwig == evexw1) ? 1 : 0;
  
    /* Encode the U bit.  */
    implied_prefix |= 0x4;
@@ -3612,22 +3760,51 @@ build_evex_prefix (void)
         {
           unsigned int op;
  
+         /* Determine vector length from the last multi-length vector
+            operand.  */
           vec_length = 0;
-         for (op = 0; op < i.tm.operands; ++op)
+         for (op = i.operands; op--;)
             if (i.tm.operand_types[op].bitfield.xmmword
                 + i.tm.operand_types[op].bitfield.ymmword
                 + i.tm.operand_types[op].bitfield.zmmword > 1)
               {
                 if (i.types[op].bitfield.zmmword)
-                 i.tm.opcode_modifier.evex = EVEX512;
+                 {
+                   i.tm.opcode_modifier.evex = EVEX512;
+                   break;
+                 }
                 else if (i.types[op].bitfield.ymmword)
-                 i.tm.opcode_modifier.evex = EVEX256;
+                 {
+                   i.tm.opcode_modifier.evex = EVEX256;
+                   break;
+                 }
                 else if (i.types[op].bitfield.xmmword)
-                 i.tm.opcode_modifier.evex = EVEX128;
-               else
-                 continue;
-               break;
+                 {
+                   i.tm.opcode_modifier.evex = EVEX128;
+                   break;
+                 }
+               else if (i.broadcast && (int) op == i.broadcast->operand)
+                 {
+                   switch (i.broadcast->bytes)
+                     {
+                       case 64:
+                         i.tm.opcode_modifier.evex = EVEX512;
+                         break;
+                       case 32:
+                         i.tm.opcode_modifier.evex = EVEX256;
+                         break;
+                       case 16:
+                         i.tm.opcode_modifier.evex = EVEX128;
+                         break;
+                       default:
+                         abort ();
+                     }
+                   break;
+                 }
               }
+
+         if (op >= MAX_OPERANDS)
+           abort ();
         }
  
        switch (i.tm.opcode_modifier.evex)
@@ -3670,52 +3847,6 @@ process_immext (void)
  {
    expressionS *exp;
  
-  if ((i.tm.cpu_flags.bitfield.cpusse3 || i.tm.cpu_flags.bitfield.cpusvme)
-      && i.operands > 0)
-    {
-      /* MONITOR/MWAIT as well as SVME instructions have fixed operands
-        with an opcode suffix which is coded in the same place as an
-        8-bit immediate field would be.
-        Here we check those operands and remove them afterwards.  */
-      unsigned int x;
-
-      for (x = 0; x < i.operands; x++)
-       if (register_number (i.op[x].regs) != x)
-         as_bad (_("can't use register '%s%s' as operand %d in '%s'."),
-                 register_prefix, i.op[x].regs->reg_name, x + 1,
-                 i.tm.name);
-
-      i.operands = 0;
-    }
-
-  if (i.tm.cpu_flags.bitfield.cpumwaitx && i.operands > 0)
-    {
-      /* MONITORX/MWAITX instructions have fixed operands with an opcode
-        suffix which is coded in the same place as an 8-bit immediate
-        field would be.
-        Here we check those operands and remove them afterwards.  */
-      unsigned int x;
-
-      if (i.operands != 3)
-       abort();
-
-      for (x = 0; x < 2; x++)
-       if (register_number (i.op[x].regs) != x)
-         goto bad_register_operand;
-
-      /* Check for third operand for mwaitx/monitorx insn.  */
-      if (register_number (i.op[x].regs)
-         != (x + (i.tm.extension_opcode == 0xfb)))
-       {
-bad_register_operand:
-         as_bad (_("can't use register '%s%s' as operand %d in '%s'."),
-                 register_prefix, i.op[x].regs->reg_name, x+1,
-                 i.tm.name);
-       }
-
-      i.operands = 0;
-    }
-
    /* These AMD 3DNow! and SSE2 instructions have an opcode suffix
       which is coded in the same place as an 8-bit immediate field
       would be.  Here we fake an 8-bit immediate operand from the
@@ -3726,9 +3857,7 @@ bad_register_operand:
  
    gas_assert (i.imm_operands <= 1
               && (i.operands <= 2
-                 || ((i.tm.opcode_modifier.vex
-                      || i.tm.opcode_modifier.vexopcode
-                      || is_evex_encoding (&i.tm))
+                 || (is_any_vex_encoding (&i.tm)
                       && i.operands <= 4)));
  
    exp = &im_expressions[i.imm_operands++];
@@ -3766,8 +3895,7 @@ check_hle (void)
                   i.tm.name);
           return 0;
         }
-      if (i.mem_operands == 0
-         || !operand_type_check (i.types[i.operands - 1], anymem))
+      if (i.mem_operands == 0 || !(i.flags[i.operands - 1] & Operand_Mem))
         {
           as_bad (_("memory destination needed for instruction `%s'"
                     " after `xrelease'"), i.tm.name);
@@ -3782,7 +3910,7 @@ check_hle (void)
  static void
  optimize_encoding (void)
  {
-  int j;
+  unsigned int j;
  
    if (optimize_for_space
        && i.reg_operands == 1
@@ -3823,7 +3951,7 @@ optimize_encoding (void)
                 && i.reg_operands == 1
                 && i.imm_operands == 1
                 && i.op[0].imms->X_op == O_constant
-               && ((i.tm.base_opcode == 0xb0
+               && ((i.tm.base_opcode == 0xb8
                      && i.tm.extension_opcode == None
                      && fits_in_unsigned_long (i.op[0].imms->X_add_number))
                     || (fits_in_imm31 (i.op[0].imms->X_add_number)
@@ -3833,8 +3961,11 @@ optimize_encoding (void)
                             || (i.tm.base_opcode == 0x80
                                 && i.tm.extension_opcode == 0x4)
                             || ((i.tm.base_opcode == 0xf6
-                                || i.tm.base_opcode == 0xc6)
-                               && i.tm.extension_opcode == 0x0)))))
+                                || (i.tm.base_opcode | 1) == 0xc7)
+                               && i.tm.extension_opcode == 0x0)))
+                   || (fits_in_imm7 (i.op[0].imms->X_add_number)
+                       && i.tm.base_opcode == 0x83
+                       && i.tm.extension_opcode == 0x4)))
                || (i.types[0].bitfield.qword
                    && ((i.reg_operands == 2
                         && i.op[0].regs == i.op[1].regs
@@ -3848,6 +3979,7 @@ optimize_encoding (void)
      {
        /* Optimize: -O:
            andq $imm31, %r64   -> andl $imm31, %r32
+          andq $imm7, %r64    -> andl $imm7, %r32
            testq $imm31, %r64  -> testl $imm31, %r32
            xorq %r64, %r64     -> xorl %r32, %r32
            subq %r64, %r64     -> subl %r32, %r32
@@ -3855,7 +3987,7 @@ optimize_encoding (void)
            movq $imm32, %r64   -> movl $imm32, %r32
          */
        i.tm.opcode_modifier.norex64 = 1;
-      if (i.tm.base_opcode == 0xb0 || i.tm.base_opcode == 0xc6)
+      if (i.tm.base_opcode == 0xb8 || (i.tm.base_opcode | 1) == 0xc7)
         {
           /* Handle
                movq $imm31, %r64   -> movl $imm31, %r32
@@ -3869,20 +4001,43 @@ optimize_encoding (void)
           i.types[0].bitfield.imm64 = 0;
           i.types[1].bitfield.dword = 1;
           i.types[1].bitfield.qword = 0;
-         if (i.tm.base_opcode == 0xc6)
+         if ((i.tm.base_opcode | 1) == 0xc7)
             {
               /* Handle
                    movq $imm31, %r64   -> movl $imm31, %r32
                */
-             i.tm.base_opcode = 0xb0;
+             i.tm.base_opcode = 0xb8;
               i.tm.extension_opcode = None;
+             i.tm.opcode_modifier.w = 0;
               i.tm.opcode_modifier.shortform = 1;
               i.tm.opcode_modifier.modrm = 0;
             }
         }
      }
    else if (optimize > 1
-          && i.reg_operands == 3
+          && !optimize_for_space
+          && i.reg_operands == 2
+          && i.op[0].regs == i.op[1].regs
+          && ((i.tm.base_opcode & ~(Opcode_D | 1)) == 0x8
+              || (i.tm.base_opcode & ~(Opcode_D | 1)) == 0x20)
+          && (flag_code != CODE_64BIT || !i.types[0].bitfield.dword))
+    {
+      /* Optimize: -O2:
+          andb %rN, %rN  -> testb %rN, %rN
+          andw %rN, %rN  -> testw %rN, %rN
+          andq %rN, %rN  -> testq %rN, %rN
+          orb %rN, %rN   -> testb %rN, %rN
+          orw %rN, %rN   -> testw %rN, %rN
+          orq %rN, %rN   -> testq %rN, %rN
+
+          and outside of 64-bit mode
+
+          andl %rN, %rN  -> testl %rN, %rN
+          orl %rN, %rN   -> testl %rN, %rN
+       */
+      i.tm.base_opcode = 0x84 | (i.tm.base_opcode & 1);
+    }
+  else if (i.reg_operands == 3
            && i.op[0].regs == i.op[1].regs
            && !i.types[2].bitfield.xmmword
            && (i.tm.opcode_modifier.vex
@@ -3890,8 +4045,10 @@ optimize_encoding (void)
                    && !i.rounding
                    && is_evex_encoding (&i.tm)
                    && (i.vec_encoding != vex_encoding_evex
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl
                        || i.tm.cpu_flags.bitfield.cpuavx512vl
-                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
+                      || (i.tm.operand_types[2].bitfield.zmmword
+                          && i.types[2].bitfield.ymmword))))
            && ((i.tm.base_opcode == 0x55
                 || i.tm.base_opcode == 0x6655
                 || i.tm.base_opcode == 0x66df
@@ -3901,18 +4058,22 @@ optimize_encoding (void)
                 || i.tm.base_opcode == 0x66f8
                 || i.tm.base_opcode == 0x66f9
                 || i.tm.base_opcode == 0x66fa
-               || i.tm.base_opcode == 0x66fb)
+               || i.tm.base_opcode == 0x66fb
+               || i.tm.base_opcode == 0x42
+               || i.tm.base_opcode == 0x6642
+               || i.tm.base_opcode == 0x47
+               || i.tm.base_opcode == 0x6647)
                && i.tm.extension_opcode == None))
      {
-      /* Optimize: -O2:
+      /* Optimize: -O1:
            VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,
            vpsubq and vpsubw:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              VEX VOP %ymmM, %ymmM, %ymmN
                -> VEX VOP %xmmM, %xmmM, %xmmN
            VOP, one of vpandn and vpxor:
@@ -3921,28 +4082,41 @@ optimize_encoding (void)
            VOP, one of vpandnd and vpandnq:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
            VOP, one of vpxord and vpxorq:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
+          VOP, one of kxord and kxorq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kxorw %kM, %kM, %kN
+          VOP, one of kandnd and kandnq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kandnw %kM, %kM, %kN
         */
        if (is_evex_encoding (&i.tm))
         {
-         if (i.vec_encoding == vex_encoding_evex)
-           i.tm.opcode_modifier.evex = EVEX128;
-         else
+         if (i.vec_encoding != vex_encoding_evex)
             {
               i.tm.opcode_modifier.vex = VEX128;
               i.tm.opcode_modifier.vexw = VEXW0;
               i.tm.opcode_modifier.evex = 0;
             }
+         else if (optimize > 1)
+           i.tm.opcode_modifier.evex = EVEX128;
+         else
+           return;
+       }
+      else if (i.tm.operand_types[0].bitfield.class == RegMask)
+       {
+         i.tm.base_opcode &= 0xff;
+         i.tm.opcode_modifier.vexw = VEXW0;
         }
        else
         i.tm.opcode_modifier.vex = VEX128;
@@ -3954,6 +4128,84 @@ optimize_encoding (void)
             i.types[j].bitfield.ymmword = 0;
           }
      }
+  else if (i.vec_encoding != vex_encoding_evex
+          && !i.types[0].bitfield.zmmword
+          && !i.types[1].bitfield.zmmword
+          && !i.mask
+          && !i.broadcast
+          && is_evex_encoding (&i.tm)
+          && ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0x666f
+              || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf36f
+              || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f
+              || (i.tm.base_opcode & ~4) == 0x66db
+              || (i.tm.base_opcode & ~4) == 0x66eb)
+          && i.tm.extension_opcode == None)
+    {
+      /* Optimize: -O1:
+          VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
+          vmovdqu32 and vmovdqu64:
+            EVEX VOP %xmmM, %xmmN
+              -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
+            EVEX VOP %ymmM, %ymmN
+              -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
+            EVEX VOP %xmmM, mem
+              -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
+            EVEX VOP %ymmM, mem
+              -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
+            EVEX VOP mem, %xmmN
+              -> VEX vmovdqa|vmovdqu mem, %xmmN (N < 16)
+            EVEX VOP mem, %ymmN
+              -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
+          VOP, one of vpand, vpandn, vpor, vpxor:
+            EVEX VOP{d,q} %xmmL, %xmmM, %xmmN
+              -> VEX VOP %xmmL, %xmmM, %xmmN (L, M, and N < 16)
+            EVEX VOP{d,q} %ymmL, %ymmM, %ymmN
+              -> VEX VOP %ymmL, %ymmM, %ymmN (L, M, and N < 16)
+            EVEX VOP{d,q} mem, %xmmM, %xmmN
+              -> VEX VOP mem, %xmmM, %xmmN (M and N < 16)
+            EVEX VOP{d,q} mem, %ymmM, %ymmN
+              -> VEX VOP mem, %ymmM, %ymmN (M and N < 16)
+       */
+      for (j = 0; j < i.operands; j++)
+       if (operand_type_check (i.types[j], disp)
+           && i.op[j].disps->X_op == O_constant)
+         {
+           /* Since the VEX prefix has 2 or 3 bytes, the EVEX prefix
+              has 4 bytes, EVEX Disp8 has 1 byte and VEX Disp32 has 4
+              bytes, we choose EVEX Disp8 over VEX Disp32.  */
+           int evex_disp8, vex_disp8;
+           unsigned int memshift = i.memshift;
+           offsetT n = i.op[j].disps->X_add_number;
+
+           evex_disp8 = fits_in_disp8 (n);
+           i.memshift = 0;
+           vex_disp8 = fits_in_disp8 (n);
+           if (evex_disp8 != vex_disp8)
+             {
+               i.memshift = memshift;
+               return;
+             }
+
+           i.types[j].bitfield.disp8 = vex_disp8;
+           break;
+         }
+      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
+       i.tm.base_opcode ^= 0xf36f ^ 0xf26f;
+      i.tm.opcode_modifier.vex
+       = i.types[0].bitfield.ymmword ? VEX256 : VEX128;
+      i.tm.opcode_modifier.vexw = VEXW0;
+      /* VPAND, VPOR, and VPXOR are commutative.  */
+      if (i.reg_operands == 3 && i.tm.base_opcode != 0x66df)
+       i.tm.opcode_modifier.commutative = 1;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.masking = 0;
+      i.tm.opcode_modifier.broadcast = 0;
+      i.tm.opcode_modifier.disp8memshift = 0;
+      i.memshift = 0;
+      if (j < i.operands)
+       i.types[j].bitfield.disp8
+         = fits_in_disp8 (i.op[j].disps->X_add_number);
+    }
  }
  
  /* This is the guts of the machine-dependent assembler.  LINE points to a
@@ -4034,14 +4286,17 @@ md_assemble (char *line)
    if (sse_check != check_none
        && !i.tm.opcode_modifier.noavx
        && !i.tm.cpu_flags.bitfield.cpuavx
+      && !i.tm.cpu_flags.bitfield.cpuavx512f
        && (i.tm.cpu_flags.bitfield.cpusse
           || i.tm.cpu_flags.bitfield.cpusse2
           || i.tm.cpu_flags.bitfield.cpusse3
           || i.tm.cpu_flags.bitfield.cpussse3
           || i.tm.cpu_flags.bitfield.cpusse4_1
           || i.tm.cpu_flags.bitfield.cpusse4_2
+         || i.tm.cpu_flags.bitfield.cpusse4a
           || i.tm.cpu_flags.bitfield.cpupclmul
           || i.tm.cpu_flags.bitfield.cpuaes
+         || i.tm.cpu_flags.bitfield.cpusha
           || i.tm.cpu_flags.bitfield.cpugfni))
      {
        (sse_check == check_warning
@@ -4083,12 +4338,19 @@ md_assemble (char *line)
        && (!i.tm.opcode_modifier.islockable
           || i.mem_operands == 0
           || (i.tm.base_opcode != 0x86
-             && !operand_type_check (i.types[i.operands - 1], anymem))))
+             && !(i.flags[i.operands - 1] & Operand_Mem))))
      {
        as_bad (_("expecting lockable instruction after `lock'"));
        return;
      }
  
+  /* Check for data size prefix on VEX/XOP/EVEX encoded insns.  */
+  if (i.prefix[DATA_PREFIX] && is_any_vex_encoding (&i.tm))
+    {
+      as_bad (_("data size prefix invalid with `%s'"), i.tm.name);
+      return;
+    }
+
    /* Check if HLE prefix is OK.  */
    if (i.hle_prefix && !check_hle ())
      return;
@@ -4112,14 +4374,21 @@ md_assemble (char *line)
      }
  
    /* Insert BND prefix.  */
-  if (add_bnd_prefix
-      && i.tm.opcode_modifier.bndprefixok
-      && !i.prefix[BND_PREFIX])
-    add_prefix (BND_PREFIX_OPCODE);
+  if (add_bnd_prefix && i.tm.opcode_modifier.bndprefixok)
+    {
+      if (!i.prefix[BND_PREFIX])
+       add_prefix (BND_PREFIX_OPCODE);
+      else if (i.prefix[BND_PREFIX] != BND_PREFIX_OPCODE)
+       {
+         as_warn (_("replacing `rep'/`repe' prefix by `bnd'"));
+         i.prefix[BND_PREFIX] = BND_PREFIX_OPCODE;
+       }
+    }
  
    /* Check string instruction segment overrides.  */
-  if (i.tm.opcode_modifier.isstring && i.mem_operands != 0)
+  if (i.tm.opcode_modifier.isstring >= IS_STRING_ES_OP0)
      {
+      gas_assert (i.mem_operands);
        if (!check_string ())
         return;
        i.disp_operands = 0;
@@ -4147,9 +4416,8 @@ md_assemble (char *line)
       with 3 operands or less.  */
    if (i.operands <= 3)
      for (j = 0; j < i.operands; j++)
-      if (i.types[j].bitfield.inoutportreg
-         || i.types[j].bitfield.shiftcount
-         || (i.types[j].bitfield.acc && !i.types[j].bitfield.xmmword))
+      if (i.types[j].bitfield.instance != InstanceNone
+         && !i.types[j].bitfield.xmmword)
         i.reg_operands--;
  
    /* ImmExt should be processed after SSE2AVX.  */
@@ -4169,12 +4437,11 @@ md_assemble (char *line)
        as_warn (_("translating to `%sp'"), i.tm.name);
      }
  
-  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.vexopcode
-      || is_evex_encoding (&i.tm))
+  if (is_any_vex_encoding (&i.tm))
      {
-      if (flag_code == CODE_16BIT)
+      if (!cpu_arch_flags.bitfield.cpui286)
         {
-         as_bad (_("instruction `%s' isn't supported in 16-bit mode."),
+         as_bad (_("instruction `%s' isn't supported outside of protected mode."),
                   i.tm.name);
           return;
         }
@@ -4196,9 +4463,9 @@ md_assemble (char *line)
        i.imm_operands = 0;
      }
  
-  if ((i.tm.opcode_modifier.jump
-       || i.tm.opcode_modifier.jumpbyte
-       || i.tm.opcode_modifier.jumpdword)
+  if ((i.tm.opcode_modifier.jump == JUMP
+       || i.tm.opcode_modifier.jump == JUMP_BYTE
+       || i.tm.opcode_modifier.jump == JUMP_DWORD)
        && i.op[0].disps->X_op == O_constant)
      {
        /* Convert "jmp constant" (and "call constant") to a jump (call) to
@@ -4215,12 +4482,12 @@ md_assemble (char *line)
       instruction already has a prefix, we need to convert old
       registers to new ones.  */
  
-  if ((i.types[0].bitfield.reg && i.types[0].bitfield.byte
+  if ((i.types[0].bitfield.class == Reg && i.types[0].bitfield.byte
         && (i.op[0].regs->reg_flags & RegRex64) != 0)
-      || (i.types[1].bitfield.reg && i.types[1].bitfield.byte
+      || (i.types[1].bitfield.class == Reg && i.types[1].bitfield.byte
           && (i.op[1].regs->reg_flags & RegRex64) != 0)
-      || (((i.types[0].bitfield.reg && i.types[0].bitfield.byte)
-          || (i.types[1].bitfield.reg && i.types[1].bitfield.byte))
+      || (((i.types[0].bitfield.class == Reg && i.types[0].bitfield.byte)
+          || (i.types[1].bitfield.class == Reg && i.types[1].bitfield.byte))
           && i.rex != 0))
      {
        int x;
@@ -4229,7 +4496,7 @@ md_assemble (char *line)
        for (x = 0; x < 2; x++)
         {
           /* Look for 8 bit operand that uses old registers.  */
-         if (i.types[x].bitfield.reg && i.types[x].bitfield.byte
+         if (i.types[x].bitfield.class == Reg && i.types[x].bitfield.byte
               && (i.op[x].regs->reg_flags & RegRex64) == 0)
             {
               /* In case it is "hi" register, give up.  */
@@ -4254,7 +4521,7 @@ md_assemble (char *line)
          the REX_OPCODE byte.  */
        int x;
        for (x = 0; x < 2; x++)
-       if (i.types[x].bitfield.reg
+       if (i.types[x].bitfield.class == Reg
             && i.types[x].bitfield.byte
             && (i.op[x].regs->reg_flags & RegRex64) == 0
             && i.op[x].regs->reg_num > 3)
@@ -4336,10 +4603,10 @@ parse_insn (char *line, char *mnemonic)
             }
           /* If we are in 16-bit mode, do not allow addr16 or data16.
              Similarly, in 32-bit mode, do not allow addr32 or data32.  */
-         if ((current_templates->start->opcode_modifier.size16
-              || current_templates->start->opcode_modifier.size32)
+         if ((current_templates->start->opcode_modifier.size == SIZE16
+              || current_templates->start->opcode_modifier.size == SIZE32)
               && flag_code != CODE_64BIT
-             && (current_templates->start->opcode_modifier.size32
+             && ((current_templates->start->opcode_modifier.size == SIZE32)
                   ^ (flag_code == CODE_16BIT)))
             {
               as_bad (_("redundant %s prefix"),
@@ -4423,10 +4690,11 @@ parse_insn (char *line, char *mnemonic)
  
    if (!current_templates)
      {
-      /* Check if we should swap operand or force 32bit displacement in
+      /* Deprecated functionality (new code should use pseudo-prefixes instead):
+        Check if we should swap operand or force 32bit displacement in
          encoding.  */
        if (mnem_p - 2 == dot_p && dot_p[1] == 's')
-       i.dir_encoding = dir_encoding_store;
+       i.dir_encoding = dir_encoding_swap;
        else if (mnem_p - 3 == dot_p
                && dot_p[1] == 'd'
                && dot_p[2] == '8')
@@ -4446,46 +4714,50 @@ parse_insn (char *line, char *mnemonic)
    if (!current_templates)
      {
  check_suffix:
-      /* See if we can get a match by trimming off a suffix.  */
-      switch (mnem_p[-1])
+      if (mnem_p > mnemonic)
         {
-       case WORD_MNEM_SUFFIX:
-         if (intel_syntax && (intel_float_operand (mnemonic) & 2))
-           i.suffix = SHORT_MNEM_SUFFIX;
-         else
-           /* Fall through.  */
-       case BYTE_MNEM_SUFFIX:
-       case QWORD_MNEM_SUFFIX:
-         i.suffix = mnem_p[-1];
-         mnem_p[-1] = '\0';
-         current_templates = (const templates *) hash_find (op_hash,
-                                                             mnemonic);
-         break;
-       case SHORT_MNEM_SUFFIX:
-       case LONG_MNEM_SUFFIX:
-         if (!intel_syntax)
+         /* See if we can get a match by trimming off a suffix.  */
+         switch (mnem_p[-1])
             {
-             i.suffix = mnem_p[-1];
-             mnem_p[-1] = '\0';
-             current_templates = (const templates *) hash_find (op_hash,
-                                                                 mnemonic);
-           }
-         break;
-
-         /* Intel Syntax.  */
-       case 'd':
-         if (intel_syntax)
-           {
-             if (intel_float_operand (mnemonic) == 1)
+           case WORD_MNEM_SUFFIX:
+             if (intel_syntax && (intel_float_operand (mnemonic) & 2))
                 i.suffix = SHORT_MNEM_SUFFIX;
               else
-               i.suffix = LONG_MNEM_SUFFIX;
+               /* Fall through.  */
+             case BYTE_MNEM_SUFFIX:
+             case QWORD_MNEM_SUFFIX:
+               i.suffix = mnem_p[-1];
               mnem_p[-1] = '\0';
               current_templates = (const templates *) hash_find (op_hash,
-                                                                 mnemonic);
+                                                                mnemonic);
+             break;
+           case SHORT_MNEM_SUFFIX:
+           case LONG_MNEM_SUFFIX:
+             if (!intel_syntax)
+               {
+                 i.suffix = mnem_p[-1];
+                 mnem_p[-1] = '\0';
+                 current_templates = (const templates *) hash_find (op_hash,
+                                                                    mnemonic);
+               }
+             break;
+
+             /* Intel Syntax.  */
+           case 'd':
+             if (intel_syntax)
+               {
+                 if (intel_float_operand (mnemonic) == 1)
+                   i.suffix = SHORT_MNEM_SUFFIX;
+                 else
+                   i.suffix = LONG_MNEM_SUFFIX;
+                 mnem_p[-1] = '\0';
+                 current_templates = (const templates *) hash_find (op_hash,
+                                                                    mnemonic);
+               }
+             break;
             }
-         break;
         }
+
        if (!current_templates)
         {
           as_bad (_("no such instruction: `%s'"), token_start);
@@ -4493,8 +4765,8 @@ check_suffix:
         }
      }
  
-  if (current_templates->start->opcode_modifier.jump
-      || current_templates->start->opcode_modifier.jumpbyte)
+  if (current_templates->start->opcode_modifier.jump == JUMP
+      || current_templates->start->opcode_modifier.jump == JUMP_BYTE)
      {
        /* Check for a branch hint.  We allow ",pt" and ",pn" for
          predict taken and predict not taken respectively.
@@ -4633,6 +4905,13 @@ parse_operands (char *l, const char *mnemonic)
           /* Now parse operand adding info to 'i' as we go along.  */
           END_STRING_AND_SAVE (l);
  
+         if (i.mem_operands > 1)
+           {
+             as_bad (_("too many memory references for `%s'"),
+                     mnemonic);
+             return 0;
+           }
+
           if (intel_syntax)
             operand_ok =
               i386_intel_operand (token_start,
@@ -4678,14 +4957,21 @@ swap_2_operands (int xchg1, int xchg2)
  {
    union i386_op temp_op;
    i386_operand_type temp_type;
+  unsigned int temp_flags;
    enum bfd_reloc_code_real temp_reloc;
  
    temp_type = i.types[xchg2];
    i.types[xchg2] = i.types[xchg1];
    i.types[xchg1] = temp_type;
+
+  temp_flags = i.flags[xchg2];
+  i.flags[xchg2] = i.flags[xchg1];
+  i.flags[xchg1] = temp_flags;
+
    temp_op = i.op[xchg2];
    i.op[xchg2] = i.op[xchg1];
    i.op[xchg1] = temp_op;
+
    temp_reloc = i.reloc[xchg2];
    i.reloc[xchg2] = i.reloc[xchg1];
    i.reloc[xchg1] = temp_reloc;
@@ -4752,26 +5038,28 @@ optimize_imm (void)
    else if (i.reg_operands)
      {
        /* Figure out a suffix from the last register operand specified.
-        We can't do this properly yet, ie. excluding InOutPortReg,
-        but the following works for instructions with immediates.
-        In any case, we can't set i.suffix yet.  */
+        We can't do this properly yet, i.e. excluding special register
+        instances, but the following works for instructions with
+        immediates.  In any case, we can't set i.suffix yet.  */
        for (op = i.operands; --op >= 0;)
-       if (i.types[op].bitfield.reg && i.types[op].bitfield.byte)
+       if (i.types[op].bitfield.class != Reg)
+         continue;
+       else if (i.types[op].bitfield.byte)
           {
             guess_suffix = BYTE_MNEM_SUFFIX;
             break;
           }
-       else if (i.types[op].bitfield.reg && i.types[op].bitfield.word)
+       else if (i.types[op].bitfield.word)
           {
             guess_suffix = WORD_MNEM_SUFFIX;
             break;
           }
-       else if (i.types[op].bitfield.reg && i.types[op].bitfield.dword)
+       else if (i.types[op].bitfield.dword)
           {
             guess_suffix = LONG_MNEM_SUFFIX;
             break;
           }
-       else if (i.types[op].bitfield.reg && i.types[op].bitfield.qword)
+       else if (i.types[op].bitfield.qword)
           {
             guess_suffix = QWORD_MNEM_SUFFIX;
             break;
@@ -4860,8 +5148,10 @@ optimize_imm (void)
               for (t = current_templates->start;
                    t < current_templates->end;
                    ++t)
-               allowed = operand_type_or (allowed,
-                                          t->operand_types[op]);
+               {
+                 allowed = operand_type_or (allowed, t->operand_types[op]);
+                 allowed = operand_type_and (allowed, anyimm);
+               }
               switch (guess_suffix)
                 {
                 case QWORD_MNEM_SUFFIX:
@@ -4968,6 +5258,22 @@ optimize_disp (void)
        }
  }
  
+/* Return 1 if the size of operand GIVEN (byte/word/dword/qword) matches
+   the broadcast element size selected by template T, and 0 otherwise.  */
+
+static INLINE int
+match_broadcast_size (const insn_template *t, unsigned int given)
+{
+  return ((t->opcode_modifier.broadcast == BYTE_BROADCAST
+          && i.types[given].bitfield.byte)
+         || (t->opcode_modifier.broadcast == WORD_BROADCAST
+             && i.types[given].bitfield.word)
+         || (t->opcode_modifier.broadcast == DWORD_BROADCAST
+             && i.types[given].bitfield.dword)
+         || (t->opcode_modifier.broadcast == QWORD_BROADCAST
+             && i.types[given].bitfield.qword));
+}
+
  /* Check if operands are valid for the instruction.  */
  
  static int
@@ -5037,10 +5343,10 @@ check_VecOperands (const insn_template *t)
        gas_assert (i.reg_operands == 2 || i.mask);
        if (i.reg_operands == 2 && !i.mask)
         {
-         gas_assert (i.types[0].bitfield.regsimd);
+         gas_assert (i.types[0].bitfield.class == RegSIMD);
           gas_assert (i.types[0].bitfield.xmmword
                       || i.types[0].bitfield.ymmword);
-         gas_assert (i.types[2].bitfield.regsimd);
+         gas_assert (i.types[2].bitfield.class == RegSIMD);
           gas_assert (i.types[2].bitfield.xmmword
                       || i.types[2].bitfield.ymmword);
           if (operand_check == check_none)
@@ -5061,7 +5367,7 @@ check_VecOperands (const insn_template *t)
         }
        else if (i.reg_operands == 1 && i.mask)
         {
-         if (i.types[1].bitfield.regsimd
+         if (i.types[1].bitfield.class == RegSIMD
               && (i.types[1].bitfield.xmmword
                   || i.types[1].bitfield.ymmword
                   || i.types[1].bitfield.zmmword)
@@ -5086,23 +5392,29 @@ check_VecOperands (const insn_template *t)
        i386_operand_type type, overlap;
  
        /* Check if specified broadcast is supported in this instruction,
-        and it's applied to memory operand of DWORD or QWORD type.  */
+        and its broadcast bytes match the memory operand.  */
        op = i.broadcast->operand;
        if (!t->opcode_modifier.broadcast
-         || !i.types[op].bitfield.mem
+         || !(i.flags[op] & Operand_Mem)
           || (!i.types[op].bitfield.unspecified
-             && (t->operand_types[op].bitfield.dword
-                 ? !i.types[op].bitfield.dword
-                 : !i.types[op].bitfield.qword)))
+             && !match_broadcast_size (t, op)))
         {
         bad_broadcast:
           i.error = unsupported_broadcast;
           return 1;
         }
  
+      i.broadcast->bytes = ((1 << (t->opcode_modifier.broadcast - 1))
+                           * i.broadcast->type);
        operand_type_set (&type, 0);
-      switch ((t->operand_types[op].bitfield.dword ? 4 : 8) * i.broadcast->type)
+      switch (i.broadcast->bytes)
         {
+       case 2:
+         type.bitfield.word = 1;
+         break;
+       case 4:
+         type.bitfield.dword = 1;
+         break;
         case 8:
           type.bitfield.qword = 1;
           break;
@@ -5145,13 +5457,11 @@ check_VecOperands (const insn_template *t)
      {
        /* Find memory operand.  */
        for (op = 0; op < i.operands; op++)
-       if (operand_type_check (i.types[op], anymem))
+       if (i.flags[op] & Operand_Mem)
           break;
        gas_assert (op < i.operands);
        /* Check size of the memory operand.  */
-      if (t->operand_types[op].bitfield.dword
-         ? i.types[op].bitfield.dword
-         : i.types[op].bitfield.qword)
+      if (match_broadcast_size (t, op))
         {
           i.error = broadcast_needed;
           return 1;
@@ -5161,13 +5471,39 @@ check_VecOperands (const insn_template *t)
      op = MAX_OPERANDS - 1; /* Avoid uninitialized variable warning.  */
  
    /* Check if requested masking is supported.  */
-  if (i.mask
-      && (!t->opcode_modifier.masking
-         || (i.mask->zeroing
-             && t->opcode_modifier.masking == MERGING_MASKING)))
+  if (i.mask)
      {
-      i.error = unsupported_masking;
-      return 1;
+      switch (t->opcode_modifier.masking)
+       {
+       case BOTH_MASKING:
+         break;
+       case MERGING_MASKING:
+         if (i.mask->zeroing)
+           {
+       case 0:
+             i.error = unsupported_masking;
+             return 1;
+           }
+         break;
+       case DYNAMIC_MASKING:
+         /* Memory destinations allow only merging masking.  */
+         if (i.mask->zeroing && i.mem_operands)
+           {
+             /* Find memory operand.  */
+             for (op = 0; op < i.operands; op++)
+               if (i.flags[op] & Operand_Mem)
+                 break;
+             gas_assert (op < i.operands);
+             if (op == i.operands - 1)
+               {
+                 i.error = unsupported_masking;
+                 return 1;
+               }
+           }
+         break;
+       default:
+         abort ();
+       }
      }
  
    /* Check if masking is applied to dest operand.  */
@@ -5180,11 +5516,8 @@ check_VecOperands (const insn_template *t)
    /* Check RC/SAE.  */
    if (i.rounding)
      {
-      if ((i.rounding->type != saeonly
-          && !t->opcode_modifier.staticrounding)
-         || (i.rounding->type == saeonly
-             && (t->opcode_modifier.staticrounding
-                 || !t->opcode_modifier.sae)))
+      if (!t->opcode_modifier.sae
+         || (i.rounding->type != saeonly && !t->opcode_modifier.staticrounding))
         {
           i.error = unsupported_rc_sae;
           return 1;
@@ -5205,9 +5538,51 @@ check_VecOperands (const insn_template *t)
        && i.disp_encoding != disp_encoding_32bit)
      {
        if (i.broadcast)
-       i.memshift = t->operand_types[op].bitfield.dword ? 2 : 3;
-      else
+       i.memshift = t->opcode_modifier.broadcast - 1;
+      else if (t->opcode_modifier.disp8memshift != DISP8_SHIFT_VL)
         i.memshift = t->opcode_modifier.disp8memshift;
+      else
+       {
+         const i386_operand_type *type = NULL;
+
+         i.memshift = 0;
+         for (op = 0; op < i.operands; op++)
+           if (i.flags[op] & Operand_Mem)
+             {
+               if (t->opcode_modifier.evex == EVEXLIG)
+                 i.memshift = 2 + (i.suffix == QWORD_MNEM_SUFFIX);
+               else if (t->operand_types[op].bitfield.xmmword
+                        + t->operand_types[op].bitfield.ymmword
+                        + t->operand_types[op].bitfield.zmmword <= 1)
+                 type = &t->operand_types[op];
+               else if (!i.types[op].bitfield.unspecified)
+                 type = &i.types[op];
+             }
+           else if (i.types[op].bitfield.class == RegSIMD
+                    && t->opcode_modifier.evex != EVEXLIG)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 i.memshift = 6;
+               else if (i.types[op].bitfield.ymmword && i.memshift < 5)
+                 i.memshift = 5;
+               else if (i.types[op].bitfield.xmmword && i.memshift < 4)
+                 i.memshift = 4;
+             }
+
+         if (type)
+           {
+             if (type->bitfield.zmmword)
+               i.memshift = 6;
+             else if (type->bitfield.ymmword)
+               i.memshift = 5;
+             else if (type->bitfield.xmmword)
+               i.memshift = 4;
+           }
+
+         /* For the check in fits_in_disp8().  */
+         if (i.memshift == 0)
+           i.memshift = -1;
+       }
  
        for (op = 0; op < i.operands; op++)
         if (operand_type_check (i.types[op], disp)
@@ -5255,8 +5630,8 @@ VEX_check_operands (const insn_template *t)
        return 0;
      }
  
-  /* Only check VEX_Imm4, which must be the first operand.  */
-  if (t->operand_types[0].bitfield.vec_imm4)
+  /* Check the special Imm4 cases; must be the first operand.  */
+  if (t->cpu_flags.bitfield.cpuxop && t->operands == 5)
      {
        if (i.op[0].imms->X_op != O_constant
           || !fits_in_imm4 (i.op[0].imms->X_add_number))
@@ -5265,8 +5640,8 @@ VEX_check_operands (const insn_template *t)
           return 1;
         }
  
-      /* Turn off Imm8 so that update_imm won't complain.  */
-      i.types[0] = vec_imm4;
+      /* Turn off Imm<N> so that update_imm won't complain.  */
+      operand_type_set (&i.types[0], 0);
      }
  
    return 0;
@@ -5280,11 +5655,11 @@ match_template (char mnem_suffix)
    i386_operand_type overlap0, overlap1, overlap2, overlap3;
    i386_operand_type overlap4;
    unsigned int found_reverse_match;
-  i386_opcode_modifier suffix_check, mnemsuf_check;
+  i386_opcode_modifier suffix_check;
    i386_operand_type operand_types [MAX_OPERANDS];
    int addr_prefix_disp;
    unsigned int j;
-  unsigned int found_cpu_match;
+  unsigned int found_cpu_match, size_match;
    unsigned int check_register;
    enum i386_error specific_error = 0;
  
@@ -5295,33 +5670,33 @@ match_template (char mnem_suffix)
    found_reverse_match = 0;
    addr_prefix_disp = -1;
  
+  /* Prepare for mnemonic suffix check.  */
    memset (&suffix_check, 0, sizeof (suffix_check));
-  if (intel_syntax && i.broadcast)
-    /* nothing */;
-  else if (i.suffix == BYTE_MNEM_SUFFIX)
-    suffix_check.no_bsuf = 1;
-  else if (i.suffix == WORD_MNEM_SUFFIX)
-    suffix_check.no_wsuf = 1;
-  else if (i.suffix == SHORT_MNEM_SUFFIX)
-    suffix_check.no_ssuf = 1;
-  else if (i.suffix == LONG_MNEM_SUFFIX)
-    suffix_check.no_lsuf = 1;
-  else if (i.suffix == QWORD_MNEM_SUFFIX)
-    suffix_check.no_qsuf = 1;
-  else if (i.suffix == LONG_DOUBLE_MNEM_SUFFIX)
-    suffix_check.no_ldsuf = 1;
-
-  memset (&mnemsuf_check, 0, sizeof (mnemsuf_check));
-  if (intel_syntax)
+  switch (mnem_suffix)
      {
-      switch (mnem_suffix)
-       {
-       case BYTE_MNEM_SUFFIX:  mnemsuf_check.no_bsuf = 1; break;
-       case WORD_MNEM_SUFFIX:  mnemsuf_check.no_wsuf = 1; break;
-       case SHORT_MNEM_SUFFIX: mnemsuf_check.no_ssuf = 1; break;
-       case LONG_MNEM_SUFFIX:  mnemsuf_check.no_lsuf = 1; break;
-       case QWORD_MNEM_SUFFIX: mnemsuf_check.no_qsuf = 1; break;
-       }
+    case BYTE_MNEM_SUFFIX:
+      suffix_check.no_bsuf = 1;
+      break;
+    case WORD_MNEM_SUFFIX:
+      suffix_check.no_wsuf = 1;
+      break;
+    case SHORT_MNEM_SUFFIX:
+      suffix_check.no_ssuf = 1;
+      break;
+    case LONG_MNEM_SUFFIX:
+      suffix_check.no_lsuf = 1;
+      break;
+    case QWORD_MNEM_SUFFIX:
+      suffix_check.no_qsuf = 1;
+      break;
+    default:
+      /* NB: In Intel syntax, normally we can check for memory operand
+        size when there is no mnemonic suffix.  But jmp and call have
+        2 different encodings with Dword memory operand size, one with
+        No_ldSuf and the other without.  i.suffix is set to
+        LONG_DOUBLE_MNEM_SUFFIX to skip the one with No_ldSuf.  */
+      if (i.suffix == LONG_DOUBLE_MNEM_SUFFIX)
+       suffix_check.no_ldsuf = 1;
      }
  
    /* Must have right number of operands.  */
@@ -5330,6 +5705,7 @@ match_template (char mnem_suffix)
    for (t = current_templates->start; t < current_templates->end; t++)
      {
        addr_prefix_disp = -1;
+      found_reverse_match = 0;
  
        if (i.operands != t->operands)
         continue;
@@ -5354,28 +5730,32 @@ match_template (char mnem_suffix)
           || (!intel64 && t->opcode_modifier.intel64))
         continue;
  
-      /* Check the suffix, except for some instructions in intel mode.  */
+      /* Check the suffix.  */
        i.error = invalid_instruction_suffix;
-      if ((!intel_syntax || !t->opcode_modifier.ignoresize)
-         && ((t->opcode_modifier.no_bsuf && suffix_check.no_bsuf)
-             || (t->opcode_modifier.no_wsuf && suffix_check.no_wsuf)
-             || (t->opcode_modifier.no_lsuf && suffix_check.no_lsuf)
-             || (t->opcode_modifier.no_ssuf && suffix_check.no_ssuf)
-             || (t->opcode_modifier.no_qsuf && suffix_check.no_qsuf)
-             || (t->opcode_modifier.no_ldsuf && suffix_check.no_ldsuf)))
-       continue;
-      /* In Intel mode all mnemonic suffixes must be explicitly allowed.  */
-      if ((t->opcode_modifier.no_bsuf && mnemsuf_check.no_bsuf)
-         || (t->opcode_modifier.no_wsuf && mnemsuf_check.no_wsuf)
-         || (t->opcode_modifier.no_lsuf && mnemsuf_check.no_lsuf)
-         || (t->opcode_modifier.no_ssuf && mnemsuf_check.no_ssuf)
-         || (t->opcode_modifier.no_qsuf && mnemsuf_check.no_qsuf)
-         || (t->opcode_modifier.no_ldsuf && mnemsuf_check.no_ldsuf))
+      if ((t->opcode_modifier.no_bsuf && suffix_check.no_bsuf)
+         || (t->opcode_modifier.no_wsuf && suffix_check.no_wsuf)
+         || (t->opcode_modifier.no_lsuf && suffix_check.no_lsuf)
+         || (t->opcode_modifier.no_ssuf && suffix_check.no_ssuf)
+         || (t->opcode_modifier.no_qsuf && suffix_check.no_qsuf)
+         || (t->opcode_modifier.no_ldsuf && suffix_check.no_ldsuf))
         continue;
  
-      if (!operand_size_match (t))
+      size_match = operand_size_match (t);
+      if (!size_match)
         continue;
  
+      /* This is intentionally not
+
+        if (i.jumpabsolute != (t->opcode_modifier.jump == JUMP_ABSOLUTE))
+
+        as the case of a missing * on the operand is accepted (perhaps with
+        a warning, issued further down).  */
+      if (i.jumpabsolute && t->opcode_modifier.jump != JUMP_ABSOLUTE)
+       {
+         i.error = operand_type_mismatch;
+         continue;
+       }
+
        for (j = 0; j < MAX_OPERANDS; j++)
         operand_types[j] = t->operand_types[j];
  
@@ -5384,12 +5764,13 @@ match_template (char mnem_suffix)
           && flag_code != CODE_64BIT
           && (intel_syntax
               ? (!t->opcode_modifier.ignoresize
+                && !t->opcode_modifier.broadcast
                  && !intel_float_operand (t->name))
               : intel_float_operand (t->name) != 2)
-         && ((!operand_types[0].bitfield.regmmx
-              && !operand_types[0].bitfield.regsimd)
-             || (!operand_types[t->operands > 1].bitfield.regmmx
-                 && !operand_types[t->operands > 1].bitfield.regsimd))
+         && ((operand_types[0].bitfield.class != RegMMX
+              && operand_types[0].bitfield.class != RegSIMD)
+             || (operand_types[t->operands > 1].bitfield.class != RegMMX
+                 && operand_types[t->operands > 1].bitfield.class != RegSIMD))
           && (t->base_opcode != 0x0fc7
               || t->extension_opcode != 1 /* cmpxchg8b */))
         continue;
@@ -5401,10 +5782,11 @@ match_template (char mnem_suffix)
                    ? (!t->opcode_modifier.ignoresize
                       && !intel_float_operand (t->name))
                    : intel_float_operand (t->name) != 2)
-              && ((!operand_types[0].bitfield.regmmx
-                   && !operand_types[0].bitfield.regsimd)
-                  || (!operand_types[t->operands > 1].bitfield.regmmx
-                      && !operand_types[t->operands > 1].bitfield.regsimd)))
+              && ((operand_types[0].bitfield.class != RegMMX
+                   && operand_types[0].bitfield.class != RegSIMD)
+                  || (operand_types[t->operands > 1].bitfield.class != RegMMX
+                      && operand_types[t->operands > 1].bitfield.class
+                         != RegSIMD)))
         continue;
  
        /* Do not verify operands when there are none.  */
@@ -5489,26 +5871,55 @@ match_template (char mnem_suffix)
              zero-extend %eax to %rax.  */
           if (flag_code == CODE_64BIT
               && t->base_opcode == 0x90
-             && operand_type_equal (&i.types [0], &acc32)
-             && operand_type_equal (&i.types [1], &acc32))
+             && i.types[0].bitfield.instance == Accum
+             && i.types[0].bitfield.dword
+             && i.types[1].bitfield.instance == Accum
+             && i.types[1].bitfield.dword)
             continue;
           /* xrelease mov %eax, <disp> is another special case. It must not
              match the accumulator-only encoding of mov.  */
           if (flag_code != CODE_64BIT
               && i.hle_prefix
               && t->base_opcode == 0xa0
-             && i.types[0].bitfield.acc
-             && operand_type_check (i.types[1], anymem))
+             && i.types[0].bitfield.instance == Accum
+             && (i.flags[1] & Operand_Mem))
             continue;
-         /* If we want store form, we reverse direction of operands.  */
-         if (i.dir_encoding == dir_encoding_store
-             && t->opcode_modifier.d)
-           goto check_reverse;
           /* Fall through.  */
  
         case 3:
+         if (!(size_match & MATCH_STRAIGHT))
+           goto check_reverse;
+         /* Reverse direction of operands if swapping is possible in the first
+            place (operands need to be symmetric) and
+            - the load form is requested, and the template is a store form,
+            - the store form is requested, and the template is a load form,
+            - the non-default (swapped) form is requested.  */
+         overlap1 = operand_type_and (operand_types[0], operand_types[1]);
+         if (t->opcode_modifier.d && i.reg_operands == i.operands
+             && !operand_type_all_zero (&overlap1))
+           switch (i.dir_encoding)
+             {
+             case dir_encoding_load:
+               if (operand_type_check (operand_types[i.operands - 1], anymem)
+                   || t->opcode_modifier.regmem)
+                 goto check_reverse;
+               break;
+
+             case dir_encoding_store:
+               if (!operand_type_check (operand_types[i.operands - 1], anymem)
+                   && !t->opcode_modifier.regmem)
+                 goto check_reverse;
+               break;
+
+             case dir_encoding_swap:
+               goto check_reverse;
+
+             case dir_encoding_default:
+               break;
+             }
           /* If we want store form, we skip the current load.  */
-         if (i.dir_encoding == dir_encoding_store
+         if ((i.dir_encoding == dir_encoding_store
+              || i.dir_encoding == dir_encoding_swap)
               && i.mem_operands == 0
               && t->opcode_modifier.load)
             continue;
@@ -5529,15 +5940,17 @@ match_template (char mnem_suffix)
                 continue;
  
  check_reverse:
+             if (!(size_match & MATCH_REVERSE))
+               continue;
               /* Try reversing direction of operands.  */
-             overlap0 = operand_type_and (i.types[0], operand_types[1]);
-             overlap1 = operand_type_and (i.types[1], operand_types[0]);
+             overlap0 = operand_type_and (i.types[0], operand_types[i.operands - 1]);
+             overlap1 = operand_type_and (i.types[i.operands - 1], operand_types[0]);
               if (!operand_type_match (overlap0, i.types[0])
-                 || !operand_type_match (overlap1, i.types[1])
+                 || !operand_type_match (overlap1, i.types[i.operands - 1])
                   || (check_register
                       && !operand_type_register_match (i.types[0],
-                                                      operand_types[1],
-                                                      i.types[1],
+                                                      operand_types[i.operands - 1],
+                                                      i.types[i.operands - 1],
                                                        operand_types[0])))
                 {
                   /* Does not match either direction.  */
@@ -5549,6 +5962,13 @@ check_reverse:
                 found_reverse_match = 0;
               else if (operand_types[0].bitfield.tbyte)
                 found_reverse_match = Opcode_FloatD;
+             else if (operand_types[0].bitfield.xmmword
+                      || operand_types[i.operands - 1].bitfield.xmmword
+                      || operand_types[0].bitfield.class == RegMMX
+                      || operand_types[i.operands - 1].bitfield.class == RegMMX
+                      || is_any_vex_encoding(t))
+               found_reverse_match = (t->base_opcode & 0xee) != 0x6e
+                                     ? Opcode_SIMD_FloatD : Opcode_SIMD_IntD;
               else
                 found_reverse_match = Opcode_D;
               if (t->opcode_modifier.floatr)
@@ -5619,10 +6039,7 @@ check_reverse:
              slip through to break.  */
         }
        if (!found_cpu_match)
-       {
-         found_reverse_match = 0;
-         continue;
-       }
+       continue;
  
        /* Check if vector and VEX operands are valid.  */
        if (check_VecOperands (t) || VEX_check_operands (t))
@@ -5683,9 +6100,6 @@ check_reverse:
         case unsupported_broadcast:
           err_msg = _("unsupported broadcast");
           break;
-       case broadcast_not_on_src_operand:
-         err_msg = _("broadcast not on source memory operand");
-         break;
         case broadcast_needed:
           err_msg = _("broadcast is needed for operand of such type");
           break;
@@ -5719,11 +6133,8 @@ check_reverse:
    if (!quiet_warnings)
      {
        if (!intel_syntax
-         && (i.types[0].bitfield.jumpabsolute
-             != operand_types[0].bitfield.jumpabsolute))
-       {
-         as_warn (_("indirect %s without `*'"), t->name);
-       }
+         && (i.jumpabsolute != (t->opcode_modifier.jump == JUMP_ABSOLUTE)))
+       as_warn (_("indirect %s without `*'"), t->name);
  
        if (t->opcode_modifier.isprefix
           && t->opcode_modifier.ignoresize)
@@ -5743,14 +6154,22 @@ check_reverse:
  
    if (found_reverse_match)
      {
-      /* If we found a reverse match we must alter the opcode
-        direction bit.  found_reverse_match holds bits to change
-        (different for int & float insns).  */
+      /* If we found a reverse match we must alter the opcode direction
+        bit and clear/flip the regmem modifier one.  found_reverse_match
+        holds bits to change (different for int & float insns).  */
  
        i.tm.base_opcode ^= found_reverse_match;
  
-      i.tm.operand_types[0] = operand_types[1];
-      i.tm.operand_types[1] = operand_types[0];
+      i.tm.operand_types[0] = operand_types[i.operands - 1];
+      i.tm.operand_types[i.operands - 1] = operand_types[0];
+
+      /* Certain SIMD insns have their load forms specified in the opcode
+        table, and hence we need to _set_ RegMem instead of clearing it.
+        We need to avoid setting the bit though on insns like KMOVW.  */
+      i.tm.opcode_modifier.regmem
+       = i.tm.opcode_modifier.modrm && i.tm.opcode_modifier.d
+         && i.tm.operands > 2U - i.tm.opcode_modifier.sse2avx
+         && !i.tm.opcode_modifier.regmem;
      }
  
    return t;
@@ -5759,34 +6178,24 @@ check_reverse:
  static int
  check_string (void)
  {
-  int mem_op = operand_type_check (i.types[0], anymem) ? 0 : 1;
-  if (i.tm.operand_types[mem_op].bitfield.esseg)
-    {
-      if (i.seg[0] != NULL && i.seg[0] != &es)
-       {
-         as_bad (_("`%s' operand %d must use `%ses' segment"),
-                 i.tm.name,
-                 mem_op + 1,
-                 register_prefix);
-         return 0;
-       }
-      /* There's only ever one segment override allowed per instruction.
-        This instruction possibly has a legal segment override on the
-        second operand, so copy the segment to where non-string
-        instructions store it, allowing common code.  */
-      i.seg[0] = i.seg[1];
-    }
-  else if (i.tm.operand_types[mem_op + 1].bitfield.esseg)
+  unsigned int es_op = i.tm.opcode_modifier.isstring - IS_STRING_ES_OP0;
+  unsigned int op = i.tm.operand_types[0].bitfield.baseindex ? es_op : 0;
+
+  if (i.seg[op] != NULL && i.seg[op] != &es)
      {
-      if (i.seg[1] != NULL && i.seg[1] != &es)
-       {
-         as_bad (_("`%s' operand %d must use `%ses' segment"),
-                 i.tm.name,
-                 mem_op + 2,
-                 register_prefix);
-         return 0;
-       }
+      as_bad (_("`%s' operand %u must use `%ses' segment"),
+             i.tm.name,
+             intel_syntax ? i.tm.operands - es_op : es_op + 1,
+             register_prefix);
+      return 0;
      }
+
+  /* There's only ever one segment override allowed per instruction.
+     This instruction possibly has a legal segment override on the
+     second operand, so copy the segment to where non-string
+     instructions store it, allowing common code.  */
+  i.seg[op] = i.seg[1];
+
    return 1;
  }
  
@@ -5795,43 +6204,41 @@ process_suffix (void)
  {
    /* If matched instruction specifies an explicit instruction mnemonic
       suffix, use it.  */
-  if (i.tm.opcode_modifier.size16)
+  if (i.tm.opcode_modifier.size == SIZE16)
      i.suffix = WORD_MNEM_SUFFIX;
-  else if (i.tm.opcode_modifier.size32)
+  else if (i.tm.opcode_modifier.size == SIZE32)
      i.suffix = LONG_MNEM_SUFFIX;
-  else if (i.tm.opcode_modifier.size64)
+  else if (i.tm.opcode_modifier.size == SIZE64)
      i.suffix = QWORD_MNEM_SUFFIX;
-  else if (i.reg_operands)
+  else if (i.reg_operands
+          && (i.operands > 1 || i.types[0].bitfield.class == Reg))
      {
        /* If there's no instruction mnemonic suffix we try to invent one
-        based on register operands.  */
+        based on GPR operands.  */
        if (!i.suffix)
         {
           /* We take i.suffix from the last register operand specified,
              Destination register type is more significant than source
              register type.  crc32 in SSE4.2 prefers source register
              type. */
-         if (i.tm.base_opcode == 0xf20f38f1)
+         if (i.tm.base_opcode == 0xf20f38f0
+             && i.types[0].bitfield.class == Reg)
             {
-             if (i.types[0].bitfield.reg && i.types[0].bitfield.word)
+             if (i.types[0].bitfield.byte)
+               i.suffix = BYTE_MNEM_SUFFIX;
+             else if (i.types[0].bitfield.word)
                 i.suffix = WORD_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.reg && i.types[0].bitfield.dword)
+             else if (i.types[0].bitfield.dword)
                 i.suffix = LONG_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.reg && i.types[0].bitfield.qword)
+             else if (i.types[0].bitfield.qword)
                 i.suffix = QWORD_MNEM_SUFFIX;
             }
-         else if (i.tm.base_opcode == 0xf20f38f0)
-           {
-             if (i.types[0].bitfield.reg && i.types[0].bitfield.byte)
-               i.suffix = BYTE_MNEM_SUFFIX;
-           }
  
           if (!i.suffix)
             {
               int op;
  
-             if (i.tm.base_opcode == 0xf20f38f1
-                 || i.tm.base_opcode == 0xf20f38f0)
+             if (i.tm.base_opcode == 0xf20f38f0)
                 {
                   /* We have to know the operand size for crc32.  */
                   as_bad (_("ambiguous memory operand size for `%s`"),
@@ -5840,10 +6247,10 @@ process_suffix (void)
                 }
  
               for (op = i.operands; --op >= 0;)
-               if (!i.tm.operand_types[op].bitfield.inoutportreg
-                   && !i.tm.operand_types[op].bitfield.shiftcount)
+               if (i.tm.operand_types[op].bitfield.instance == InstanceNone
+                   || i.tm.operand_types[op].bitfield.instance == Accum)
                   {
-                   if (!i.types[op].bitfield.reg)
+                   if (i.types[op].bitfield.class != Reg)
                       continue;
                     if (i.types[op].bitfield.byte)
                       i.suffix = BYTE_MNEM_SUFFIX;
@@ -5908,15 +6315,34 @@ process_suffix (void)
    else if (i.tm.opcode_modifier.defaultsize
            && !i.suffix
            /* exclude fldenv/frstor/fsave/fstenv */
-          && i.tm.opcode_modifier.no_ssuf)
+          && i.tm.opcode_modifier.no_ssuf
+          /* exclude sysret */
+          && i.tm.base_opcode != 0x0f07)
      {
        i.suffix = stackop_size;
+      if (stackop_size == LONG_MNEM_SUFFIX)
+       {
+         /* stackop_size is set to LONG_MNEM_SUFFIX for the
+            .code16gcc directive to support 16-bit mode with
+            32-bit address.  For IRET without a suffix, generate
+            16-bit IRET (opcode 0xcf) to return from an interrupt
+            handler.  */
+         if (i.tm.base_opcode == 0xcf)
+           {
+             i.suffix = WORD_MNEM_SUFFIX;
+             as_warn (_("generating 16-bit `iret' for .code16gcc directive"));
+           }
+         /* Warn about changed behavior for segment register push/pop.  */
+         else if ((i.tm.base_opcode | 1) == 0x07)
+           as_warn (_("generating 32-bit `%s', unlike earlier gas versions"),
+                    i.tm.name);
+       }
      }
    else if (intel_syntax
            && !i.suffix
-          && (i.tm.operand_types[0].bitfield.jumpabsolute
-              || i.tm.opcode_modifier.jumpbyte
-              || i.tm.opcode_modifier.jumpintersegment
+          && (i.tm.opcode_modifier.jump == JUMP_ABSOLUTE
+              || i.tm.opcode_modifier.jump == JUMP_BYTE
+              || i.tm.opcode_modifier.jump == JUMP_INTERSEGMENT
                || (i.tm.base_opcode == 0x0f01 /* [ls][gi]dt */
                    && i.tm.extension_opcode <= 3)))
      {
@@ -6006,9 +6432,9 @@ process_suffix (void)
          size prefix, except for instructions that will ignore this
          prefix anyway.  */
        if (i.reg_operands > 0
-         && i.types[0].bitfield.reg
+         && i.types[0].bitfield.class == Reg
           && i.tm.opcode_modifier.addrprefixopreg
-         && (i.tm.opcode_modifier.immext
+         && (i.tm.operand_types[0].bitfield.instance == Accum
               || i.operands == 1))
         {
           /* The address size override prefix changes the size of the
@@ -6023,13 +6449,14 @@ process_suffix (void)
        else if (i.suffix != QWORD_MNEM_SUFFIX
                && !i.tm.opcode_modifier.ignoresize
                && !i.tm.opcode_modifier.floatmf
+              && !is_any_vex_encoding (&i.tm)
                && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
                    || (flag_code == CODE_64BIT
-                      && i.tm.opcode_modifier.jumpbyte)))
+                      && i.tm.opcode_modifier.jump == JUMP_BYTE)))
         {
           unsigned int prefix = DATA_PREFIX_OPCODE;
  
-         if (i.tm.opcode_modifier.jumpbyte) /* jcxz, loop */
+         if (i.tm.opcode_modifier.jump == JUMP_BYTE) /* jcxz, loop */
             prefix = ADDR_PREFIX_OPCODE;
  
           if (!add_prefix (prefix))
@@ -6045,8 +6472,10 @@ process_suffix (void)
           && ! (i.operands == 2
                 && i.tm.base_opcode == 0x90
                 && i.tm.extension_opcode == None
-               && operand_type_equal (&i.types [0], &acc64)
-               && operand_type_equal (&i.types [1], &acc64)))
+               && i.types[0].bitfield.instance == Accum
+               && i.types[0].bitfield.qword
+               && i.types[1].bitfield.instance == Accum
+               && i.types[1].bitfield.qword))
         i.rex |= REX_W;
  
        break;
@@ -6055,7 +6484,7 @@ process_suffix (void)
    if (i.reg_operands != 0
        && i.operands > 1
        && i.tm.opcode_modifier.addrprefixopreg
-      && !i.tm.opcode_modifier.immext)
+      && i.tm.operand_types[0].bitfield.instance != Accum)
      {
        /* Check invalid register operand when the address size override
          prefix changes the size of register operands.  */
@@ -6073,7 +6502,7 @@ process_suffix (void)
         }
  
        for (op = 0; op < i.operands; op++)
-       if (i.types[op].bitfield.reg
+       if (i.types[op].bitfield.class == Reg
             && ((need == need_word
                  && !i.op[op].regs->reg_type.bitfield.word)
                 || (need == need_dword
@@ -6098,7 +6527,7 @@ check_byte_reg (void)
    for (op = i.operands; --op >= 0;)
      {
        /* Skip non-register operands. */
-      if (!i.types[op].bitfield.reg)
+      if (i.types[op].bitfield.class != Reg)
         continue;
  
        /* If this is an eight bit register, it's OK.  If it's the 16 or
@@ -6108,7 +6537,8 @@ check_byte_reg (void)
         continue;
  
        /* I/O port address operands are OK too.  */
-      if (i.tm.operand_types[op].bitfield.inoutportreg)
+      if (i.tm.operand_types[op].bitfield.instance == RegD
+         && i.tm.operand_types[op].bitfield.word)
         continue;
  
        /* crc32 doesn't generate this warning.  */
@@ -6137,14 +6567,13 @@ check_byte_reg (void)
           continue;
         }
        /* Any other register is bad.  */
-      if (i.types[op].bitfield.reg
-         || i.types[op].bitfield.regmmx
-         || i.types[op].bitfield.regsimd
-         || i.types[op].bitfield.sreg2
-         || i.types[op].bitfield.sreg3
-         || i.types[op].bitfield.control
-         || i.types[op].bitfield.debug
-         || i.types[op].bitfield.test)
+      if (i.types[op].bitfield.class == Reg
+         || i.types[op].bitfield.class == RegMMX
+         || i.types[op].bitfield.class == RegSIMD
+         || i.types[op].bitfield.class == SReg
+         || i.types[op].bitfield.class == RegCR
+         || i.types[op].bitfield.class == RegDR
+         || i.types[op].bitfield.class == RegTR)
         {
           as_bad (_("`%s%s' not allowed with `%s%c'"),
                   register_prefix,
@@ -6164,13 +6593,13 @@ check_long_reg (void)
  
    for (op = i.operands; --op >= 0;)
      /* Skip non-register operands. */
-    if (!i.types[op].bitfield.reg)
+    if (i.types[op].bitfield.class != Reg)
        continue;
      /* Reject eight bit registers, except where the template requires
         them. (eg. movzb)  */
      else if (i.types[op].bitfield.byte
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && (i.tm.operand_types[op].bitfield.word
                  || i.tm.operand_types[op].bitfield.dword))
        {
@@ -6184,8 +6613,8 @@ check_long_reg (void)
      /* Warn if the e prefix on a general reg is missing.  */
      else if ((!quiet_warnings || flag_code == CODE_64BIT)
              && i.types[op].bitfield.word
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && i.tm.operand_types[op].bitfield.dword)
        {
         /* Prohibit these changes in the 64bit mode, since the
@@ -6206,13 +6635,13 @@ check_long_reg (void)
        }
      /* Warn if the r prefix on a general reg is present.  */
      else if (i.types[op].bitfield.qword
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && i.tm.operand_types[op].bitfield.dword)
        {
         if (intel_syntax
             && i.tm.opcode_modifier.toqword
-           && !i.types[0].bitfield.regsimd)
+           && i.types[0].bitfield.class != RegSIMD)
           {
             /* Convert to QWORD.  We want REX byte. */
             i.suffix = QWORD_MNEM_SUFFIX;
@@ -6235,13 +6664,13 @@ check_qword_reg (void)
  
    for (op = i.operands; --op >= 0; )
      /* Skip non-register operands. */
-    if (!i.types[op].bitfield.reg)
+    if (i.types[op].bitfield.class != Reg)
        continue;
      /* Reject eight bit registers, except where the template requires
         them. (eg. movzb)  */
      else if (i.types[op].bitfield.byte
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && (i.tm.operand_types[op].bitfield.word
                  || i.tm.operand_types[op].bitfield.dword))
        {
@@ -6255,15 +6684,15 @@ check_qword_reg (void)
      /* Warn if the r prefix on a general reg is missing.  */
      else if ((i.types[op].bitfield.word
               || i.types[op].bitfield.dword)
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && i.tm.operand_types[op].bitfield.qword)
        {
         /* Prohibit these changes in the 64bit mode, since the
            lowering is more complicated.  */
         if (intel_syntax
             && i.tm.opcode_modifier.todword
-           && !i.types[0].bitfield.regsimd)
+           && i.types[0].bitfield.class != RegSIMD)
           {
             /* Convert to DWORD.  We don't want REX byte. */
             i.suffix = LONG_MNEM_SUFFIX;
@@ -6285,13 +6714,13 @@ check_word_reg (void)
    int op;
    for (op = i.operands; --op >= 0;)
      /* Skip non-register operands. */
-    if (!i.types[op].bitfield.reg)
+    if (i.types[op].bitfield.class != Reg)
        continue;
      /* Reject eight bit registers, except where the template requires
         them. (eg. movzb)  */
      else if (i.types[op].bitfield.byte
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && (i.tm.operand_types[op].bitfield.word
                  || i.tm.operand_types[op].bitfield.dword))
        {
@@ -6306,8 +6735,8 @@ check_word_reg (void)
      else if ((!quiet_warnings || flag_code == CODE_64BIT)
              && (i.types[op].bitfield.dword
                  || i.types[op].bitfield.qword)
-            && (i.tm.operand_types[op].bitfield.reg
-                || i.tm.operand_types[op].bitfield.acc)
+            && (i.tm.operand_types[op].bitfield.class == Reg
+                || i.tm.operand_types[op].bitfield.instance == Accum)
              && i.tm.operand_types[op].bitfield.word)
        {
         /* Prohibit these changes in the 64bit mode, since the
@@ -6432,15 +6861,15 @@ process_operands (void)
                   && MAX_OPERANDS > dupl
                   && operand_type_equal (&i.types[dest], &regxmm));
  
-      if (i.tm.operand_types[0].bitfield.acc
+      if (i.tm.operand_types[0].bitfield.instance == Accum
           && i.tm.operand_types[0].bitfield.xmmword)
         {
           if (i.tm.opcode_modifier.vexsources == VEX3SOURCES)
             {
               /* Keep xmm0 for instructions with VEX prefix and 3
                  sources.  */
-             i.tm.operand_types[0].bitfield.acc = 0;
-             i.tm.operand_types[0].bitfield.regsimd = 1;
+             i.tm.operand_types[0].bitfield.instance = InstanceNone;
+             i.tm.operand_types[0].bitfield.class = RegSIMD;
               goto duplicate;
             }
           else
@@ -6453,6 +6882,7 @@ process_operands (void)
                   i.op[j - 1] = i.op[j];
                   i.types[j - 1] = i.types[j];
                   i.tm.operand_types[j - 1] = i.tm.operand_types[j];
+                 i.flags[j - 1] = i.flags[j];
                 }
             }
         }
@@ -6469,6 +6899,7 @@ process_operands (void)
               i.op[j] = i.op[j - 1];
               i.types[j] = i.types[j - 1];
               i.tm.operand_types[j] = i.tm.operand_types[j - 1];
+             i.flags[j] = i.flags[j - 1];
             }
           i.op[0].regs
             = (const reg_entry *) hash_find (reg_hash, "xmm0");
@@ -6484,6 +6915,7 @@ process_operands (void)
           i.op[dupl] = i.op[dest];
           i.types[dupl] = i.types[dest];
           i.tm.operand_types[dupl] = i.tm.operand_types[dest];
+         i.flags[dupl] = i.flags[dest];
         }
        else
         {
@@ -6495,12 +6927,13 @@ duplicate:
           i.op[dupl] = i.op[dest];
           i.types[dupl] = i.types[dest];
           i.tm.operand_types[dupl] = i.tm.operand_types[dest];
+         i.flags[dupl] = i.flags[dest];
         }
  
         if (i.tm.opcode_modifier.immext)
          process_immext ();
      }
-  else if (i.tm.operand_types[0].bitfield.acc
+  else if (i.tm.operand_types[0].bitfield.instance == Accum
            && i.tm.operand_types[0].bitfield.xmmword)
      {
        unsigned int j;
@@ -6513,6 +6946,8 @@ duplicate:
           /* We need to adjust fields in i.tm since they are used by
              build_modrm_byte.  */
           i.tm.operand_types [j - 1] = i.tm.operand_types [j];
+
+         i.flags[j - 1] = i.flags[j];
         }
  
        i.operands--;
@@ -6524,7 +6959,7 @@ duplicate:
        unsigned int regnum, first_reg_in_group, last_reg_in_group;
  
        /* The second operand must be {x,y,z}mmN, where N is a multiple of 4. */
-      gas_assert (i.operands >= 2 && i.types[1].bitfield.regsimd);
+      gas_assert (i.operands >= 2 && i.types[1].bitfield.class == RegSIMD);
        regnum = register_number (i.op[1].regs);
        first_reg_in_group = regnum & ~3;
        last_reg_in_group = first_reg_in_group + 3;
@@ -6557,57 +6992,7 @@ duplicate:
        i.reg_operands++;
      }
  
-  if (i.tm.opcode_modifier.shortform)
-    {
-      if (i.types[0].bitfield.sreg2
-         || i.types[0].bitfield.sreg3)
-       {
-         if (i.tm.base_opcode == POP_SEG_SHORT
-             && i.op[0].regs->reg_num == 1)
-           {
-             as_bad (_("you can't `pop %scs'"), register_prefix);
-             return 0;
-           }
-         i.tm.base_opcode |= (i.op[0].regs->reg_num << 3);
-         if ((i.op[0].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_B;
-       }
-      else
-       {
-         /* The register or float register operand is in operand
-            0 or 1.  */
-         unsigned int op;
-
-         if ((i.types[0].bitfield.reg && i.types[0].bitfield.tbyte)
-             || operand_type_check (i.types[0], reg))
-           op = 0;
-         else
-           op = 1;
-         /* Register goes in low 3 bits of opcode.  */
-         i.tm.base_opcode |= i.op[op].regs->reg_num;
-         if ((i.op[op].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_B;
-         if (!quiet_warnings && i.tm.opcode_modifier.ugh)
-           {
-             /* Warn about some common errors, but press on regardless.
-                The first case can be generated by gcc (<= 2.8.1).  */
-             if (i.operands == 2)
-               {
-                 /* Reversed arguments on faddp, fsubp, etc.  */
-                 as_warn (_("translating to `%s %s%s,%s%s'"), i.tm.name,
-                          register_prefix, i.op[!intel_syntax].regs->reg_name,
-                          register_prefix, i.op[intel_syntax].regs->reg_name);
-               }
-             else
-               {
-                 /* Extraneous `l' suffix on fp insn.  */
-                 as_warn (_("translating to `%s %s%s'"), i.tm.name,
-                          register_prefix, i.op[0].regs->reg_name);
-               }
-           }
-       }
-    }
-  else if (i.tm.opcode_modifier.modrm)
+  if (i.tm.opcode_modifier.modrm)
      {
        /* The opcode is completed (modulo i.tm.extension_opcode which
          must be put into the modrm byte).  Now, we make the modrm and
@@ -6615,6 +7000,25 @@ duplicate:
  
        default_seg = build_modrm_byte ();
      }
+  else if (i.types[0].bitfield.class == SReg)
+    {
+      if (flag_code != CODE_64BIT
+         ? i.tm.base_opcode == POP_SEG_SHORT
+           && i.op[0].regs->reg_num == 1
+         : (i.tm.base_opcode | 1) == POP_SEG386_SHORT
+           && i.op[0].regs->reg_num < 4)
+       {
+         as_bad (_("you can't `%s %s%s'"),
+                 i.tm.name, register_prefix, i.op[0].regs->reg_name);
+         return 0;
+       }
+      if ( i.op[0].regs->reg_num > 3 && i.tm.opcode_length == 1 )
+       {
+         i.tm.base_opcode ^= POP_SEG_SHORT ^ POP_SEG386_SHORT;
+         i.tm.opcode_length = 2;
+       }
+      i.tm.base_opcode |= (i.op[0].regs->reg_num << 3);
+    }
    else if ((i.tm.base_opcode & ~0x3) == MOV_AX_DISP32)
      {
        default_seg = &ds;
@@ -6625,6 +7029,35 @@ duplicate:
          on one of their operands, the default segment is ds.  */
        default_seg = &ds;
      }
+  else if (i.tm.opcode_modifier.shortform)
+    {
+      /* The register or float register operand is in operand
+        0 or 1.  */
+      unsigned int op = i.tm.operand_types[0].bitfield.class != Reg;
+
+      /* Register goes in low 3 bits of opcode.  */
+      i.tm.base_opcode |= i.op[op].regs->reg_num;
+      if ((i.op[op].regs->reg_flags & RegRex) != 0)
+       i.rex |= REX_B;
+      if (!quiet_warnings && i.tm.opcode_modifier.ugh)
+       {
+         /* Warn about some common errors, but press on regardless.
+            The first case can be generated by gcc (<= 2.8.1).  */
+         if (i.operands == 2)
+           {
+             /* Reversed arguments on faddp, fsubp, etc.  */
+             as_warn (_("translating to `%s %s%s,%s%s'"), i.tm.name,
+                      register_prefix, i.op[!intel_syntax].regs->reg_name,
+                      register_prefix, i.op[intel_syntax].regs->reg_name);
+           }
+         else
+           {
+             /* Extraneous `l' suffix on fp insn.  */
+             as_warn (_("translating to `%s %s%s'"), i.tm.name,
+                      register_prefix, i.op[0].regs->reg_name);
+           }
+       }
+    }
  
    if (i.tm.base_opcode == 0x8d /* lea */
        && i.seg[0]
@@ -6662,7 +7095,7 @@ build_modrm_byte (void)
  
        /* There are 2 kinds of instructions:
          1. 5 operands: 4 register operands or 3 register operands
-        plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
+        plus 1 memory operand plus one Imm4 operand, VexXDS, and
          VexW0 or VexW1.  The destination must be either XMM, YMM or
          ZMM register.
          2. 4 operands: 4 register operands or 3 register operands
@@ -6671,7 +7104,7 @@ build_modrm_byte (void)
                    || (i.reg_operands == 3 && i.mem_operands == 1))
                   && i.tm.opcode_modifier.vexvvvv == VEXXDS
                   && i.tm.opcode_modifier.vexw
-                 && i.tm.operand_types[dest].bitfield.regsimd);
+                 && i.tm.operand_types[dest].bitfield.class == RegSIMD);
  
        /* If VexW1 is set, the first non-immediate operand is the source and
          the second non-immediate one is encoded in the immediate operand.  */
@@ -6695,40 +7128,27 @@ build_modrm_byte (void)
           i.types[i.operands] = imm8;
           i.operands++;
  
-         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.class == RegSIMD);
           exp->X_op = O_constant;
           exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
           gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
         }
        else
         {
-         unsigned int imm_slot;
+         gas_assert (i.imm_operands == 1);
+         gas_assert (fits_in_imm4 (i.op[0].imms->X_add_number));
+         gas_assert (!i.tm.opcode_modifier.immext);
  
-         gas_assert (i.imm_operands == 1 && i.types[0].bitfield.vec_imm4);
+         /* Turn on Imm8 again so that output_imm will generate it.  */
+         i.types[0].bitfield.imm8 = 1;
  
-         if (i.tm.opcode_modifier.immext)
-           {
-             /* When ImmExt is set, the immediate byte is the last
-                operand.  */
-             imm_slot = i.operands - 1;
-             source--;
-             reg_slot--;
-           }
-         else
-           {
-             imm_slot = 0;
-
-             /* Turn on Imm8 so that output_imm will generate it.  */
-             i.types[imm_slot].bitfield.imm8 = 1;
-           }
-
-         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-         i.op[imm_slot].imms->X_add_number
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.class == RegSIMD);
+         i.op[0].imms->X_add_number
               |= register_number (i.op[reg_slot].regs) << 4;
           gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
         }
  
-      gas_assert (i.tm.operand_types[nds].bitfield.regsimd);
+      gas_assert (i.tm.operand_types[nds].bitfield.class == RegSIMD);
        i.vex.register_specifier = i.op[nds].regs;
      }
    else
@@ -6760,9 +7180,11 @@ build_modrm_byte (void)
           gas_assert (i.imm_operands == 1
                       || (i.imm_operands == 0
                           && (i.tm.opcode_modifier.vexvvvv == VEXXDS
-                             || i.types[0].bitfield.shiftcount)));
+                             || (i.types[0].bitfield.instance == RegC
+                                 && i.types[0].bitfield.byte))));
           if (operand_type_check (i.types[0], imm)
-             || i.types[0].bitfield.shiftcount)
+             || (i.types[0].bitfield.instance == RegC
+                 && i.types[0].bitfield.byte))
             source = 1;
           else
             source = 0;
@@ -6830,8 +7252,7 @@ build_modrm_byte (void)
             {
               /* For instructions with VexNDS, the register-only source
                  operand must be a 32/64bit integer, XMM, YMM, ZMM, or mask
-                register.  It is encoded in VEX prefix.  We need to
-                clear RegMem bit before calling operand_type_equal.  */
+                register.  It is encoded in VEX prefix.  */
  
               i386_operand_type op;
               unsigned int vvvv;
@@ -6848,11 +7269,10 @@ build_modrm_byte (void)
                 vvvv = dest;
  
               op = i.tm.operand_types[vvvv];
-             op.bitfield.regmem = 0;
               if ((dest + 1) >= i.operands
-                 || ((!op.bitfield.reg
+                 || ((op.bitfield.class != Reg
                        || (!op.bitfield.dword && !op.bitfield.qword))
-                     && !op.bitfield.regsimd
+                     && op.bitfield.class != RegSIMD
                       && !operand_type_equal (&op, &regmask)))
                 abort ();
               i.vex.register_specifier = i.op[vvvv].regs;
@@ -6861,17 +7281,32 @@ build_modrm_byte (void)
         }
  
        i.rm.mode = 3;
-      /* One of the register operands will be encoded in the i.tm.reg
-        field, the other in the combined i.tm.mode and i.tm.regmem
+      /* One of the register operands will be encoded in the i.rm.reg
+        field, the other in the combined i.rm.mode and i.rm.regmem
          fields.  If no form of this instruction supports a memory
          destination operand, then we assume the source operand may
          sometimes be a memory operand and so we need to store the
          destination in the i.rm.reg field.  */
-      if (!i.tm.operand_types[dest].bitfield.regmem
+      if (!i.tm.opcode_modifier.regmem
           && operand_type_check (i.tm.operand_types[dest], anymem) == 0)
         {
           i.rm.reg = i.op[dest].regs->reg_num;
           i.rm.regmem = i.op[source].regs->reg_num;
+         if (i.op[dest].regs->reg_type.bitfield.class == RegMMX
+              || i.op[source].regs->reg_type.bitfield.class == RegMMX)
+           i.has_regmmx = TRUE;
+         else if (i.op[dest].regs->reg_type.bitfield.class == RegSIMD
+                  || i.op[source].regs->reg_type.bitfield.class == RegSIMD)
+           {
+             if (i.types[dest].bitfield.zmmword
+                 || i.types[source].bitfield.zmmword)
+               i.has_regzmm = TRUE;
+             else if (i.types[dest].bitfield.ymmword
+                      || i.types[source].bitfield.ymmword)
+               i.has_regymm = TRUE;
+             else
+               i.has_regxmm = TRUE;
+           }
           if ((i.op[dest].regs->reg_flags & RegRex) != 0)
             i.rex |= REX_R;
           if ((i.op[dest].regs->reg_flags & RegVRex) != 0)
@@ -6896,7 +7331,7 @@ build_modrm_byte (void)
         }
        if (flag_code != CODE_64BIT && (i.rex & REX_R))
         {
-         if (!i.types[i.tm.operand_types[0].bitfield.regmem].bitfield.control)
+         if (i.types[!i.tm.opcode_modifier.regmem].bitfield.class != RegCR)
             abort ();
           i.rex &= ~REX_R;
           add_prefix (LOCK_PREFIX_OPCODE);
@@ -6912,14 +7347,13 @@ build_modrm_byte (void)
           unsigned int op;
  
           for (op = 0; op < i.operands; op++)
-           if (operand_type_check (i.types[op], anymem))
+           if (i.flags[op] & Operand_Mem)
               break;
           gas_assert (op < i.operands);
  
           if (i.tm.opcode_modifier.vecsib)
             {
-             if (i.index_reg->reg_num == RegEiz
-                 || i.index_reg->reg_num == RegRiz)
+             if (i.index_reg->reg_num == RegIZ)
                 abort ();
  
               i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
@@ -6990,8 +7424,7 @@ build_modrm_byte (void)
               else if (!i.tm.opcode_modifier.vecsib)
                 {
                   /* !i.base_reg && i.index_reg  */
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                     i.sib.index = NO_INDEX_REGISTER;
                   else
                     i.sib.index = i.index_reg->reg_num;
@@ -7017,8 +7450,7 @@ build_modrm_byte (void)
                 }
             }
           /* RIP addressing for 64bit mode.  */
-         else if (i.base_reg->reg_num == RegRip ||
-                  i.base_reg->reg_num == RegEip)
+         else if (i.base_reg->reg_num == RegIP)
             {
               gas_assert (!i.tm.opcode_modifier.vecsib);
               i.rm.regmem = NO_BASE_REGISTER;
@@ -7110,8 +7542,7 @@ build_modrm_byte (void)
                 }
               else if (!i.tm.opcode_modifier.vecsib)
                 {
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                     i.sib.index = NO_INDEX_REGISTER;
                   else
                     i.sib.index = i.index_reg->reg_num;
@@ -7215,17 +7646,31 @@ build_modrm_byte (void)
           unsigned int vex_reg = ~0;
  
           for (op = 0; op < i.operands; op++)
-           if (i.types[op].bitfield.reg
-               || i.types[op].bitfield.regmmx
-               || i.types[op].bitfield.regsimd
-               || i.types[op].bitfield.regbnd
-               || i.types[op].bitfield.regmask
-               || i.types[op].bitfield.sreg2
-               || i.types[op].bitfield.sreg3
-               || i.types[op].bitfield.control
-               || i.types[op].bitfield.debug
-               || i.types[op].bitfield.test)
-             break;
+           {
+             if (i.types[op].bitfield.class == Reg
+                 || i.types[op].bitfield.class == RegBND
+                 || i.types[op].bitfield.class == RegMask
+                 || i.types[op].bitfield.class == SReg
+                 || i.types[op].bitfield.class == RegCR
+                 || i.types[op].bitfield.class == RegDR
+                 || i.types[op].bitfield.class == RegTR)
+               break;
+             if (i.types[op].bitfield.class == RegSIMD)
+               {
+                 if (i.types[op].bitfield.zmmword)
+                   i.has_regzmm = TRUE;
+                 else if (i.types[op].bitfield.ymmword)
+                   i.has_regymm = TRUE;
+                 else
+                   i.has_regxmm = TRUE;
+                 break;
+               }
+             if (i.types[op].bitfield.class == RegMMX)
+               {
+                 i.has_regmmx = TRUE;
+                 break;
+               }
+           }
  
           if (vex_3_sources)
             op = dest;
@@ -7284,9 +7729,9 @@ build_modrm_byte (void)
             {
               i386_operand_type *type = &i.tm.operand_types[vex_reg];
  
-             if ((!type->bitfield.reg
+             if ((type->bitfield.class != Reg
                    || (!type->bitfield.dword && !type->bitfield.qword))
-                 && !type->bitfield.regsimd
+                 && type->bitfield.class != RegSIMD
                   && !operand_type_equal (type, &regmask))
                 abort ();
  
@@ -7426,6 +7871,12 @@ need_plt32_p (symbolS *s)
    if (!IS_ELF)
      return FALSE;
  
+#ifdef TE_SOLARIS
+  /* Don't emit PLT32 relocations on Solaris: neither the native linker
+     nor krtld supports them.  */
+  return FALSE;
+#endif
+
    /* Since there is no need to prepare for PLT branch on x86-64, we
       can generate R_X86_64_PLT32, instead of R_X86_64_PC32, which can
       be used as a marker for 32-bit PC-relative branches.  */
@@ -7456,7 +7907,7 @@ output_jump (void)
    fixS *fixP;
    bfd_reloc_code_real_type jump_reloc = i.reloc[0];
  
-  if (i.tm.opcode_modifier.jumpbyte)
+  if (i.tm.opcode_modifier.jump == JUMP_BYTE)
      {
        /* This is a loop or jecxz type instruction.  */
        size = 1;
@@ -7605,12 +8056,233 @@ output_interseg_jump (void)
    md_number_to_chars (p + size, (valueT) i.op[0].imms->X_add_number, 2);
  }
  
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+void
+x86_cleanup (void)
+{
+  char *p;
+  asection *seg = now_seg;
+  subsegT subseg = now_subseg;
+  asection *sec;
+  unsigned int alignment, align_size_1;
+  unsigned int isa_1_descsz, feature_2_descsz, descsz;
+  unsigned int isa_1_descsz_raw, feature_2_descsz_raw;
+  unsigned int padding;
+
+  if (!IS_ELF || !x86_used_note)
+    return;
+
+  x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X86;
+
+  /* The .note.gnu.property section layout:
+
+     Field     Length          Contents
+     ----      ----            ----
+     n_namsz   4               4
+     n_descsz  4               The note descriptor size
+     n_type    4               NT_GNU_PROPERTY_TYPE_0
+     n_name    4               "GNU"
+     n_desc    n_descsz        The program property array
+     ....      ....            ....
+   */
+
+  /* Create the .note.gnu.property section.  */
+  sec = subseg_new (NOTE_GNU_PROPERTY_SECTION_NAME, 0);
+  bfd_set_section_flags (sec,
+                        (SEC_ALLOC
+                         | SEC_LOAD
+                         | SEC_DATA
+                         | SEC_HAS_CONTENTS
+                         | SEC_READONLY));
+
+  if (get_elf_backend_data (stdoutput)->s->elfclass == ELFCLASS64)
+    {
+      align_size_1 = 7;
+      alignment = 3;
+    }
+  else
+    {
+      align_size_1 = 3;
+      alignment = 2;
+    }
+
+  bfd_set_section_alignment (sec, alignment);
+  elf_section_type (sec) = SHT_NOTE;
+
+  /* GNU_PROPERTY_X86_ISA_1_USED: 4-byte type + 4-byte data size
+                                 + 4-byte data  */
+  isa_1_descsz_raw = 4 + 4 + 4;
+  /* Align GNU_PROPERTY_X86_ISA_1_USED.  */
+  isa_1_descsz = (isa_1_descsz_raw + align_size_1) & ~align_size_1;
+
+  feature_2_descsz_raw = isa_1_descsz;
+  /* GNU_PROPERTY_X86_FEATURE_2_USED: 4-byte type + 4-byte data size
+                                     + 4-byte data  */
+  feature_2_descsz_raw += 4 + 4 + 4;
+  /* Align GNU_PROPERTY_X86_FEATURE_2_USED.  */
+  feature_2_descsz = ((feature_2_descsz_raw + align_size_1)
+                     & ~align_size_1);
+
+  descsz = feature_2_descsz;
+  /* Section size: n_namsz + n_descsz + n_type + n_name + n_desc.  */
+  p = frag_more (4 + 4 + 4 + 4 + descsz);
+
+  /* Write n_namsz.  */
+  md_number_to_chars (p, (valueT) 4, 4);
+
+  /* Write n_descsz.  */
+  md_number_to_chars (p + 4, (valueT) descsz, 4);
+
+  /* Write n_type.  */
+  md_number_to_chars (p + 4 * 2, (valueT) NT_GNU_PROPERTY_TYPE_0, 4);
+
+  /* Write n_name.  */
+  memcpy (p + 4 * 3, "GNU", 4);
+
+  /* Write 4-byte type of the GNU_PROPERTY_X86_ISA_1_USED property.  */
+  md_number_to_chars (p + 4 * 4,
+                     (valueT) GNU_PROPERTY_X86_ISA_1_USED, 4);
+
+  /* Write 4-byte data size.  */
+  md_number_to_chars (p + 4 * 5, (valueT) 4, 4);
+
+  /* Write 4-byte data.  */
+  md_number_to_chars (p + 4 * 6, (valueT) x86_isa_1_used, 4);
+
+  /* Zero out the alignment padding after the ISA_1_USED property.  */
+  padding = isa_1_descsz - isa_1_descsz_raw;
+  if (padding)
+    memset (p + 4 * 7, 0, padding);
+
+  /* Write 4-byte type of the GNU_PROPERTY_X86_FEATURE_2_USED property.  */
+  md_number_to_chars (p + isa_1_descsz + 4 * 4,
+                     (valueT) GNU_PROPERTY_X86_FEATURE_2_USED, 4);
+
+  /* Write 4-byte data size.  */
+  md_number_to_chars (p + isa_1_descsz + 4 * 5, (valueT) 4, 4);
+
+  /* Write 4-byte data.  */
+  md_number_to_chars (p + isa_1_descsz + 4 * 6,
+                     (valueT) x86_feature_2_used, 4);
+
+  /* Zero out the alignment padding after the FEATURE_2_USED property.  */
+  padding = feature_2_descsz - feature_2_descsz_raw;
+  if (padding)
+    memset (p + isa_1_descsz + 4 * 7, 0, padding);
+
+  /* We probably can't restore the current segment, for there likely
+     isn't one yet...  */
+  if (seg && subseg)
+    subseg_set (seg, subseg);
+}
+#endif
+
+static unsigned int
+encoding_length (const fragS *start_frag, offsetT start_off,
+                const char *frag_now_ptr)
+{
+  unsigned int len = 0;
+
+  if (start_frag != frag_now)
+    {
+      const fragS *fr = start_frag;
+
+      do {
+       len += fr->fr_fix;
+       fr = fr->fr_next;
+      } while (fr && fr != frag_now);
+    }
+
+  return len - start_off + (frag_now_ptr - frag_now->fr_literal);
+}
+
  static void
  output_insn (void)
  {
    fragS *insn_start_frag;
    offsetT insn_start_off;
  
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+  if (IS_ELF && x86_used_note)
+    {
+      if (i.tm.cpu_flags.bitfield.cpucmov)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_CMOV;
+      if (i.tm.cpu_flags.bitfield.cpusse)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE;
+      if (i.tm.cpu_flags.bitfield.cpusse2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE2;
+      if (i.tm.cpu_flags.bitfield.cpusse3)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE3;
+      if (i.tm.cpu_flags.bitfield.cpussse3)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSSE3;
+      if (i.tm.cpu_flags.bitfield.cpusse4_1)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE4_1;
+      if (i.tm.cpu_flags.bitfield.cpusse4_2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE4_2;
+      if (i.tm.cpu_flags.bitfield.cpuavx)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX;
+      if (i.tm.cpu_flags.bitfield.cpuavx2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX2;
+      if (i.tm.cpu_flags.bitfield.cpufma)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_FMA;
+      if (i.tm.cpu_flags.bitfield.cpuavx512f)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512F;
+      if (i.tm.cpu_flags.bitfield.cpuavx512cd)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512CD;
+      if (i.tm.cpu_flags.bitfield.cpuavx512er)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512ER;
+      if (i.tm.cpu_flags.bitfield.cpuavx512pf)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512PF;
+      if (i.tm.cpu_flags.bitfield.cpuavx512vl)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512VL;
+      if (i.tm.cpu_flags.bitfield.cpuavx512dq)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512DQ;
+      if (i.tm.cpu_flags.bitfield.cpuavx512bw)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512BW;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_4fmaps)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_4FMAPS;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_4vnniw)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_4VNNIW;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_bitalg)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_BITALG;
+      if (i.tm.cpu_flags.bitfield.cpuavx512ifma)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_IFMA;
+      if (i.tm.cpu_flags.bitfield.cpuavx512vbmi)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_VBMI;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_vbmi2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_VBMI2;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_vnni)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_VNNI;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_bf16)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_BF16;
+
+      if (i.tm.cpu_flags.bitfield.cpu8087
+         || i.tm.cpu_flags.bitfield.cpu287
+         || i.tm.cpu_flags.bitfield.cpu387
+         || i.tm.cpu_flags.bitfield.cpu687
+         || i.tm.cpu_flags.bitfield.cpufisttp)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X87;
+      if (i.has_regmmx
+         || i.tm.base_opcode == 0xf77 /* emms */
+         || i.tm.base_opcode == 0xf0e /* femms */)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_MMX;
+      if (i.has_regxmm)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XMM;
+      if (i.has_regymm)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_YMM;
+      if (i.has_regzmm)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_ZMM;
+      if (i.tm.cpu_flags.bitfield.cpufxsr)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_FXSR;
+      if (i.tm.cpu_flags.bitfield.cpuxsave)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVE;
+      if (i.tm.cpu_flags.bitfield.cpuxsaveopt)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVEOPT;
+      if (i.tm.cpu_flags.bitfield.cpuxsavec)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVEC;
+    }
+#endif
+
    /* Tie dwarf2 debug info to the address at the start of the insn.
       We can't do this after the insn has been output as the current
       frag may have been closed off.  eg. by frag_var.  */
@@ -7620,12 +8292,12 @@ output_insn (void)
    insn_start_off = frag_now_fix ();
  
    /* Output jumps.  */
-  if (i.tm.opcode_modifier.jump)
+  if (i.tm.opcode_modifier.jump == JUMP)
      output_branch ();
-  else if (i.tm.opcode_modifier.jumpbyte
-          || i.tm.opcode_modifier.jumpdword)
+  else if (i.tm.opcode_modifier.jump == JUMP_BYTE
+          || i.tm.opcode_modifier.jump == JUMP_DWORD)
      output_jump ();
-  else if (i.tm.opcode_modifier.jumpintersegment)
+  else if (i.tm.opcode_modifier.jump == JUMP_INTERSEGMENT)
      output_interseg_jump ();
    else
      {
@@ -7636,12 +8308,9 @@ output_insn (void)
        unsigned int prefix;
  
        if (avoid_fence
-         && i.tm.base_opcode == 0xfae
-         && i.operands == 1
-         && i.imm_operands == 1
-         && (i.op[0].imms->X_add_number == 0xe8
-             || i.op[0].imms->X_add_number == 0xf0
-             || i.op[0].imms->X_add_number == 0xf8))
+         && (i.tm.base_opcode == 0xfaee8
+             || i.tm.base_opcode == 0xfaef0
+             || i.tm.base_opcode == 0xfaef8))
          {
            /* Encode lfence, mfence, and sfence as
               f0 83 04 24 00   lock addl $0x0, (%{re}sp).  */
@@ -7670,23 +8339,17 @@ output_insn (void)
               if (i.tm.base_opcode & 0xff000000)
                 {
                   prefix = (i.tm.base_opcode >> 24) & 0xff;
-                 goto check_prefix;
+                 if (!i.tm.cpu_flags.bitfield.cpupadlock
+                     || prefix != REPE_PREFIX_OPCODE
+                     || (i.prefix[REP_PREFIX] != REPE_PREFIX_OPCODE))
+                   add_prefix (prefix);
                 }
               break;
             case 2:
               if ((i.tm.base_opcode & 0xff0000) != 0)
                 {
                   prefix = (i.tm.base_opcode >> 16) & 0xff;
-                 if (i.tm.cpu_flags.bitfield.cpupadlock)
-                   {
-check_prefix:
-                     if (prefix != REPE_PREFIX_OPCODE
-                         || (i.prefix[REP_PREFIX]
-                             != REPE_PREFIX_OPCODE))
-                       add_prefix (prefix);
-                   }
-                 else
-                   add_prefix (prefix);
+                 add_prefix (prefix);
                 }
               break;
             case 1:
@@ -7801,6 +8464,19 @@ check_prefix:
  
        if (i.imm_operands)
         output_imm (insn_start_frag, insn_start_off);
+
+      /*
+       * Skip the length check in the absolute section: there frag_now_fix ()
+       * returns plain abs_section_offset, which is not updated as data gets
+       * added to the frag, and that breaks the logic below.
+       */
+      if (now_seg != absolute_section)
+       {
+         j = encoding_length (insn_start_frag, insn_start_off, frag_more (0));
+         if (j > 15)
+           as_warn (_("instruction length of %u bytes exceeds the limit of 15"),
+                    j);
+       }
      }
  
  #ifdef DEBUG386
@@ -7857,7 +8533,8 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
               int size = disp_size (n);
               offsetT val = i.op[n].disps->X_add_number;
  
-             val = offset_in_range (val >> i.memshift, size);
+             val = offset_in_range (val >> (size == 1 ? i.memshift : 0),
+                                    size);
               p = frag_more (size);
               md_number_to_chars (p, val, size);
             }
@@ -7908,25 +8585,11 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
                                    == O_subtract))))
                       || reloc_type == BFD_RELOC_32_PCREL))
                 {
-                 offsetT add;
-
-                 if (insn_start_frag == frag_now)
-                   add = (p - frag_now->fr_literal) - insn_start_off;
-                 else
-                   {
-                     fragS *fr;
-
-                     add = insn_start_frag->fr_fix - insn_start_off;
-                     for (fr = insn_start_frag->fr_next;
-                          fr && fr != frag_now; fr = fr->fr_next)
-                       add += fr->fr_fix;
-                     add += p - frag_now->fr_literal;
-                   }
-
                   if (!object_64bit)
                     {
                       reloc_type = BFD_RELOC_386_GOTPC;
-                     i.op[n].imms->X_add_number += add;
+                     i.op[n].imms->X_add_number +=
+                       encoding_length (insn_start_frag, insn_start_off, p);
                     }
                   else if (reloc_type == BFD_RELOC_64)
                     reloc_type = BFD_RELOC_X86_64_GOTPC64;
@@ -7942,12 +8605,13 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
               /* Check for "call/jmp *mem", "mov mem, %reg",
                  "test %reg, mem" and "binop mem, %reg" where binop
                  is one of adc, add, and, cmp, or, sbb, sub, xor
-                instructions.  Always generate R_386_GOT32X for
-                "sym*GOT" operand in 32-bit mode.  */
-             if ((generate_relax_relocations
-                  || (!object_64bit
-                      && i.rm.mode == 0
-                      && i.rm.regmem == 5))
+                instructions without data prefix.  Always generate
+                R_386_GOT32X for "sym*GOT" operand in 32-bit mode.  */
+             if (i.prefix[DATA_PREFIX] == 0
+                 && (generate_relax_relocations
+                     || (!object_64bit
+                         && i.rm.mode == 0
+                         && i.rm.regmem == 5))
                   && (i.rm.mode == 2
                       || (i.rm.mode == 0 && i.rm.regmem == 5))
                   && ((i.operands == 1
@@ -7962,8 +8626,7 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
                     {
                       fixP->fx_tcbit = i.rex != 0;
                       if (i.base_reg
-                         && (i.base_reg->reg_num == RegRip
-                             || i.base_reg->reg_num == RegEip))
+                         && (i.base_reg->reg_num == RegIP))
                       fixP->fx_tcbit2 = 1;
                     }
                   else
@@ -8071,28 +8734,14 @@ output_imm (fragS *insn_start_frag, offsetT insn_start_off)
                                (i.op[n].imms->X_op_symbol)->X_op)
                               == O_subtract))))
                 {
-                 offsetT add;
-
-                 if (insn_start_frag == frag_now)
-                   add = (p - frag_now->fr_literal) - insn_start_off;
-                 else
-                   {
-                     fragS *fr;
-
-                     add = insn_start_frag->fr_fix - insn_start_off;
-                     for (fr = insn_start_frag->fr_next;
-                          fr && fr != frag_now; fr = fr->fr_next)
-                       add += fr->fr_fix;
-                     add += p - frag_now->fr_literal;
-                   }
-
                   if (!object_64bit)
                     reloc_type = BFD_RELOC_386_GOTPC;
                   else if (size == 4)
                     reloc_type = BFD_RELOC_X86_64_GOTPC32;
                   else if (size == 8)
                     reloc_type = BFD_RELOC_X86_64_GOTPC64;
-                 i.op[n].imms->X_add_number += add;
+                 i.op[n].imms->X_add_number +=
+                   encoding_length (insn_start_frag, insn_start_off, p);
                 }
               fix_new_exp (frag_now, p - frag_now->fr_literal, size,
                            i.op[n].imms, 0, reloc_type);
@@ -8443,6 +9092,15 @@ x86_cons (expressionS *exp, int size)
               as_bad (_("missing or invalid expression `%s'"), save);
               *input_line_pointer = c;
             }
+         else if ((got_reloc == BFD_RELOC_386_PLT32
+                   || got_reloc == BFD_RELOC_X86_64_PLT32)
+                  && exp->X_op != O_symbol)
+           {
+             char c = *input_line_pointer;
+             *input_line_pointer = 0;
+             as_bad (_("invalid PLT expression `%s'"), save);
+             *input_line_pointer = c;
+           }
         }
      }
    else
@@ -8533,13 +9191,14 @@ check_VecOperations (char *op_string, char *op_end)
  
               broadcast_op.type = bcst_type;
               broadcast_op.operand = this_operand;
+             broadcast_op.bytes = 0;
               i.broadcast = &broadcast_op;
             }
           /* Check masking operation.  */
           else if ((mask = parse_register (op_string, &end_op)) != NULL)
             {
               /* k0 can't be used for write mask.  */
-             if (!mask->reg_type.bitfield.regmask || mask->reg_num == 0)
+             if (mask->reg_type.bitfield.class != RegMask || !mask->reg_num)
                 {
                   as_bad (_("`%s%s' can't be used for write mask"),
                           register_prefix, mask->reg_name);
@@ -8817,9 +9476,9 @@ i386_displacement (char *disp_start, char *disp_end)
      }
  
    operand_type_set (&bigdisp, 0);
-  if ((i.types[this_operand].bitfield.jumpabsolute)
-      || (!current_templates->start->opcode_modifier.jump
-         && !current_templates->start->opcode_modifier.jumpdword))
+  if (i.jumpabsolute
+      || (current_templates->start->opcode_modifier.jump != JUMP
+         && current_templates->start->opcode_modifier.jump != JUMP_DWORD))
      {
        bigdisp.bitfield.disp32 = 1;
        override = (i.prefix[ADDR_PREFIX] != 0);
@@ -9063,9 +9722,7 @@ i386_addressing_mode (void)
  
           if (addr_reg)
             {
-             if (addr_reg->reg_num == RegEip
-                 || addr_reg->reg_num == RegEiz
-                 || addr_reg->reg_type.bitfield.dword)
+             if (addr_reg->reg_type.bitfield.dword)
                 addr_mode = CODE_32BIT;
               else if (flag_code != CODE_64BIT
                        && addr_reg->reg_type.bitfield.word)
@@ -9105,7 +9762,7 @@ i386_index_check (const char *operand_string)
    enum flag_code addr_mode = i386_addressing_mode ();
  
    if (current_templates->start->opcode_modifier.isstring
-      && !current_templates->start->opcode_modifier.immext
+      && !current_templates->start->cpu_flags.bitfield.cpupadlock
        && (current_templates->end[-1].opcode_modifier.isstring
           || i.mem_operands))
      {
@@ -9124,16 +9781,16 @@ i386_index_check (const char *operand_string)
  
        if (current_templates->start->opcode_modifier.repprefixok)
         {
-         i386_operand_type type = current_templates->end[-1].operand_types[0];
+         int es_op = current_templates->end[-1].opcode_modifier.isstring
+                     - IS_STRING_ES_OP0;
+         int op = 0;
  
-         if (!type.bitfield.baseindex
+         if (!current_templates->end[-1].operand_types[0].bitfield.baseindex
               || ((!i.mem_operands != !intel_syntax)
                   && current_templates->end[-1].operand_types[1]
                      .bitfield.baseindex))
-           type = current_templates->end[-1].operand_types[1];
-         expected_reg = hash_find (reg_hash,
-                                   di_si[addr_mode][type.bitfield.esseg]);
-
+           op = 1;
+         expected_reg = hash_find (reg_hash, di_si[addr_mode][op == es_op]);
         }
        else
         expected_reg = hash_find (reg_hash, bx[addr_mode]);
@@ -9175,21 +9832,18 @@ bad_address:
         {
           /* 32-bit/64-bit checks.  */
           if ((i.base_reg
-              && (addr_mode == CODE_64BIT
-                  ? !i.base_reg->reg_type.bitfield.qword
-                  : !i.base_reg->reg_type.bitfield.dword)
-              && (i.index_reg
-                  || (i.base_reg->reg_num
-                      != (addr_mode == CODE_64BIT ? RegRip : RegEip))))
+              && ((addr_mode == CODE_64BIT
+                   ? !i.base_reg->reg_type.bitfield.qword
+                   : !i.base_reg->reg_type.bitfield.dword)
+                  || (i.index_reg && i.base_reg->reg_num == RegIP)
+                  || i.base_reg->reg_num == RegIZ))
               || (i.index_reg
                   && !i.index_reg->reg_type.bitfield.xmmword
                   && !i.index_reg->reg_type.bitfield.ymmword
                   && !i.index_reg->reg_type.bitfield.zmmword
                   && ((addr_mode == CODE_64BIT
-                      ? !(i.index_reg->reg_type.bitfield.qword
-                          || i.index_reg->reg_num == RegRiz)
-                      : !(i.index_reg->reg_type.bitfield.dword
-                          || i.index_reg->reg_num == RegEiz))
+                      ? !i.index_reg->reg_type.bitfield.qword
+                      : !i.index_reg->reg_type.bitfield.dword)
                       || !i.index_reg->reg_type.bitfield.baseindex)))
             goto bad_address;
  
@@ -9198,7 +9852,7 @@ bad_address:
               || (current_templates->start->base_opcode & ~1) == 0x0f1a)
             {
               /* They cannot use RIP-relative addressing. */
-             if (i.base_reg && i.base_reg->reg_num == RegRip)
+             if (i.base_reg && i.base_reg->reg_num == RegIP)
                 {
                   as_bad (_("`%s' cannot be used here"), operand_string);
                   return 0;
@@ -9353,7 +10007,7 @@ i386_att_operand (char *operand_string)
        ++op_string;
        if (is_space_char (*op_string))
         ++op_string;
-      i.types[this_operand].bitfield.jumpabsolute = 1;
+      i.jumpabsolute = TRUE;
      }
  
    /* Check if operand is a register.  */
@@ -9366,9 +10020,7 @@ i386_att_operand (char *operand_string)
        op_string = end_op;
        if (is_space_char (*op_string))
         ++op_string;
-      if (*op_string == ':'
-         && (r->reg_type.bitfield.sreg2
-             || r->reg_type.bitfield.sreg3))
+      if (*op_string == ':' && r->reg_type.bitfield.class == SReg)
         {
           switch (r->reg_num)
             {
@@ -9411,7 +10063,7 @@ i386_att_operand (char *operand_string)
               ++op_string;
               if (is_space_char (*op_string))
                 ++op_string;
-             i.types[this_operand].bitfield.jumpabsolute = 1;
+             i.jumpabsolute = TRUE;
             }
           goto do_memory_reference;
         }
@@ -9445,7 +10097,7 @@ i386_att_operand (char *operand_string)
    else if (*op_string == IMMEDIATE_PREFIX)
      {
        ++op_string;
-      if (i.types[this_operand].bitfield.jumpabsolute)
+      if (i.jumpabsolute)
         {
           as_bad (_("immediate operand illegal with absolute jump"));
           return 0;
@@ -9637,20 +10289,20 @@ i386_att_operand (char *operand_string)
  
        /* Special case for (%dx) while doing input/output op.  */
        if (i.base_reg
-         && operand_type_equal (&i.base_reg->reg_type,
-                                &reg16_inoutportreg)
+         && i.base_reg->reg_type.bitfield.instance == RegD
+         && i.base_reg->reg_type.bitfield.word
           && i.index_reg == 0
           && i.log2_scale_factor == 0
           && i.seg[i.mem_operands] == 0
           && !operand_type_check (i.types[this_operand], disp))
         {
-         i.types[this_operand] = inoutportreg;
+         i.types[this_operand] = i.base_reg->reg_type;
           return 1;
         }
  
        if (i386_index_check (operand_string) == 0)
         return 0;
-      i.types[this_operand].bitfield.mem = 1;
+      i.flags[this_operand] |= Operand_Mem;
        if (i.mem_operands == 0)
         i.memop1_string = xstrdup (operand_string);
        i.mem_operands++;
@@ -10062,9 +10714,11 @@ md_apply_fix (fixS *fixP, valueT *valP, segT seg ATTRIBUTE_UNUSED)
        {
        case BFD_RELOC_386_PLT32:
        case BFD_RELOC_X86_64_PLT32:
-       /* Make the jump instruction point to the address of the operand.  At
-          runtime we merely add the offset to the actual PLT entry.  */
-       value = -4;
+       /* Make the jump instruction point to the address of the operand.
+          At runtime we merely add the offset to the actual PLT entry.
+          NB: Subtract the offset size only for jump instructions.  */
+       if (fixP->fx_pcrel)
+         value = -4;
         break;
  
        case BFD_RELOC_386_TLS_GD:
@@ -10230,19 +10884,20 @@ parse_real_register (char *reg_string, char **end_op)
      return (const reg_entry *) NULL;
  
    if ((r->reg_type.bitfield.dword
-       || r->reg_type.bitfield.sreg3
-       || r->reg_type.bitfield.control
-       || r->reg_type.bitfield.debug
-       || r->reg_type.bitfield.test)
+       || (r->reg_type.bitfield.class == SReg && r->reg_num > 3)
+       || r->reg_type.bitfield.class == RegCR
+       || r->reg_type.bitfield.class == RegDR
+       || r->reg_type.bitfield.class == RegTR)
        && !cpu_arch_flags.bitfield.cpui386)
      return (const reg_entry *) NULL;
  
-  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpummx)
+  if (r->reg_type.bitfield.class == RegMMX && !cpu_arch_flags.bitfield.cpummx)
      return (const reg_entry *) NULL;
  
    if (!cpu_arch_flags.bitfield.cpuavx512f)
      {
-      if (r->reg_type.bitfield.zmmword || r->reg_type.bitfield.regmask)
+      if (r->reg_type.bitfield.zmmword
+         || r->reg_type.bitfield.class == RegMask)
         return (const reg_entry *) NULL;
  
        if (!cpu_arch_flags.bitfield.cpuavx)
@@ -10255,19 +10910,18 @@ parse_real_register (char *reg_string, char **end_op)
         }
      }
  
-  if (r->reg_type.bitfield.regbnd && !cpu_arch_flags.bitfield.cpumpx)
+  if (r->reg_type.bitfield.class == RegBND && !cpu_arch_flags.bitfield.cpumpx)
      return (const reg_entry *) NULL;
  
    /* Don't allow fake index register unless allow_index_reg isn't 0. */
-  if (!allow_index_reg
-      && (r->reg_num == RegEiz || r->reg_num == RegRiz))
+  if (!allow_index_reg && r->reg_num == RegIZ)
      return (const reg_entry *) NULL;
  
    /* Upper 16 vector registers are only available with VREX in 64bit
       mode, and require EVEX encoding.  */
    if (r->reg_flags & RegVRex)
      {
-      if (!cpu_arch_flags.bitfield.cpuvrex
+      if (!cpu_arch_flags.bitfield.cpuavx512f
           || flag_code != CODE_64BIT)
         return (const reg_entry *) NULL;
  
@@ -10275,11 +10929,12 @@ parse_real_register (char *reg_string, char **end_op)
      }
  
    if (((r->reg_flags & (RegRex64 | RegRex)) || r->reg_type.bitfield.qword)
-      && (!cpu_arch_flags.bitfield.cpulm || !r->reg_type.bitfield.control)
+      && (!cpu_arch_flags.bitfield.cpulm || r->reg_type.bitfield.class != RegCR)
        && flag_code != CODE_64BIT)
      return (const reg_entry *) NULL;
  
-  if (r->reg_type.bitfield.sreg3 && r->reg_num == RegFlat && !intel_syntax)
+  if (r->reg_type.bitfield.class == SReg && r->reg_num == RegFlat
+      && !intel_syntax)
      return (const reg_entry *) NULL;
  
    return r;
@@ -10415,6 +11070,8 @@ const char *md_shortopts = "qnO::";
  #define OPTION_MAMD64 (OPTION_MD_BASE + 22)
  #define OPTION_MINTEL64 (OPTION_MD_BASE + 23)
  #define OPTION_MFENCE_AS_LOCK_ADD (OPTION_MD_BASE + 24)
+#define OPTION_X86_USED_NOTE (OPTION_MD_BASE + 25)
+#define OPTION_MVEXWIG (OPTION_MD_BASE + 26)
  
  struct option md_longopts[] =
  {
@@ -10426,6 +11083,7 @@ struct option md_longopts[] =
  #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
    {"x32", no_argument, NULL, OPTION_X32},
    {"mshared", no_argument, NULL, OPTION_MSHARED},
+  {"mx86-used-note", required_argument, NULL, OPTION_X86_USED_NOTE},
  #endif
    {"divide", no_argument, NULL, OPTION_DIVIDE},
    {"march", required_argument, NULL, OPTION_MARCH},
@@ -10438,6 +11096,7 @@ struct option md_longopts[] =
    {"msse-check", required_argument, NULL, OPTION_MSSE_CHECK},
    {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
    {"mavxscalar", required_argument, NULL, OPTION_MAVXSCALAR},
+  {"mvexwig", required_argument, NULL, OPTION_MVEXWIG},
    {"madd-bnd-prefix", no_argument, NULL, OPTION_MADD_BND_PREFIX},
    {"mevexlig", required_argument, NULL, OPTION_MEVEXLIG},
    {"mevexwig", required_argument, NULL, OPTION_MEVEXWIG},
@@ -10474,6 +11133,8 @@ md_parse_option (int c, const char *arg)
        /* -Qy, -Qn: SVR4 arguments controlling whether a .comment section
          should be emitted or not.  FIXME: Not implemented.  */
      case 'Q':
+      if ((arg[0] != 'y' && arg[0] != 'n') || arg[1])
+       return 0;
        break;
  
        /* -V: SVR4 argument to print version ID.  */
@@ -10493,6 +11154,17 @@ md_parse_option (int c, const char *arg)
      case OPTION_MSHARED:
        shared = 1;
        break;
+
+    case OPTION_X86_USED_NOTE:
+      if (strcasecmp (arg, "yes") == 0)
+        x86_used_note = 1;
+      else if (strcasecmp (arg, "no") == 0)
+        x86_used_note = 0;
+      else
+        as_fatal (_("invalid -mx86-used-note= option: `%s'"), arg);
+      break;
+
+
  #endif
  #if (defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) \
       || defined (TE_PE) || defined (TE_PEP) || defined (OBJ_MACH_O))
@@ -10744,6 +11416,15 @@ md_parse_option (int c, const char *arg)
         as_fatal (_("invalid -mavxscalar= option: `%s'"), arg);
        break;
  
+    case OPTION_MVEXWIG:
+      if (strcmp (arg, "0") == 0)
+       vexwig = vexw0;
+      else if (strcmp (arg, "1") == 0)
+       vexwig = vexw1;
+      else
+       as_fatal (_("invalid -mvexwig= option: `%s'"), arg);
+      break;
+
      case OPTION_MADD_BND_PREFIX:
        add_bnd_prefix = 1;
        break;
@@ -10833,7 +11514,7 @@ md_parse_option (int c, const char *arg)
         {
           optimize_for_space = 1;
           /* Turn on all encoding optimizations.  */
-         optimize = -1;
+         optimize = INT_MAX;
         }
        else
         {
@@ -10956,7 +11637,7 @@ md_show_usage (FILE *stream)
  {
  #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
    fprintf (stream, _("\
-  -Q                      ignored\n\
+  -Qy, -Qn                ignored\n\
    -V                      print assembler version number\n\
    -k                      ignored\n"));
  #endif
@@ -10967,8 +11648,8 @@ md_show_usage (FILE *stream)
    fprintf (stream, _("\
    -s                      ignored\n"));
  #endif
-#if (defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) \
-     || defined (TE_PE) || defined (TE_PEP))
+#if defined BFD64 && (defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) \
+                     || defined (TE_PE) || defined (TE_PEP))
    fprintf (stream, _("\
    --32/--64/--x32         generate 32bit/64bit/x32 code\n"));
  #endif
@@ -10992,52 +11673,81 @@ md_show_usage (FILE *stream)
    fprintf (stream, _("\
    -msse2avx               encode SSE instructions with VEX prefix\n"));
    fprintf (stream, _("\
-  -msse-check=[none|error|warning]\n\
+  -msse-check=[none|error|warning] (default: warning)\n\
                            check SSE instructions\n"));
    fprintf (stream, _("\
-  -moperand-check=[none|error|warning]\n\
+  -moperand-check=[none|error|warning] (default: warning)\n\
                            check operand combinations for validity\n"));
    fprintf (stream, _("\
-  -mavxscalar=[128|256]   encode scalar AVX instructions with specific vector\n\
+  -mavxscalar=[128|256] (default: 128)\n\
+                          encode scalar AVX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
-  -mevexlig=[128|256|512] encode scalar EVEX instructions with specific vector\n\
+  -mvexwig=[0|1] (default: 0)\n\
+                          encode VEX instructions with specific VEX.W value\n\
+                           for VEX.W bit ignored instructions\n"));
+  fprintf (stream, _("\
+  -mevexlig=[128|256|512] (default: 128)\n\
+                          encode scalar EVEX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
-  -mevexwig=[0|1]         encode EVEX instructions with specific EVEX.W value\n\
+  -mevexwig=[0|1] (default: 0)\n\
+                          encode EVEX instructions with specific EVEX.W value\n\
                             for EVEX.W bit ignored instructions\n"));
    fprintf (stream, _("\
-  -mevexrcig=[rne|rd|ru|rz]\n\
+  -mevexrcig=[rne|rd|ru|rz] (default: rne)\n\
                            encode EVEX instructions with specific EVEX.RC value\n\
                             for SAE-only ignored instructions\n"));
    fprintf (stream, _("\
-  -mmnemonic=[att|intel]  use AT&T/Intel mnemonic\n"));
+  -mmnemonic=[att|intel] "));
+  if (SYSV386_COMPAT)
+    fprintf (stream, _("(default: att)\n"));
+  else
+    fprintf (stream, _("(default: intel)\n"));
+  fprintf (stream, _("\
+                          use AT&T/Intel mnemonic\n"));
    fprintf (stream, _("\
-  -msyntax=[att|intel]    use AT&T/Intel syntax\n"));
+  -msyntax=[att|intel] (default: att)\n\
+                          use AT&T/Intel syntax\n"));
    fprintf (stream, _("\
    -mindex-reg             support pseudo index registers\n"));
    fprintf (stream, _("\
    -mnaked-reg             don't require `%%' prefix for registers\n"));
    fprintf (stream, _("\
    -madd-bnd-prefix        add BND prefix for all valid branches\n"));
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
    fprintf (stream, _("\
    -mshared                disable branch optimization for shared code\n"));
-# if defined (TE_PE) || defined (TE_PEP)
+  fprintf (stream, _("\
+  -mx86-used-note=[no|yes] "));
+  if (DEFAULT_X86_USED_NOTE)
+    fprintf (stream, _("(default: yes)\n"));
+  else
+    fprintf (stream, _("(default: no)\n"));
+  fprintf (stream, _("\
+                          generate x86 used ISA and feature properties\n"));
+#endif
+#if defined (TE_PE) || defined (TE_PEP)
    fprintf (stream, _("\
    -mbig-obj               generate big object files\n"));
  #endif
    fprintf (stream, _("\
-  -momit-lock-prefix=[no|yes]\n\
+  -momit-lock-prefix=[no|yes] (default: no)\n\
                            strip all lock prefixes\n"));
    fprintf (stream, _("\
-  -mfence-as-lock-add=[no|yes]\n\
+  -mfence-as-lock-add=[no|yes] (default: no)\n\
                            encode lfence, mfence and sfence as\n\
                             lock addl $0x0, (%%{re}sp)\n"));
    fprintf (stream, _("\
-  -mrelax-relocations=[no|yes]\n\
+  -mrelax-relocations=[no|yes] "));
+  if (DEFAULT_GENERATE_X86_RELAX_RELOCATIONS)
+    fprintf (stream, _("(default: yes)\n"));
+  else
+    fprintf (stream, _("(default: no)\n"));
+  fprintf (stream, _("\
                            generate relax relocations\n"));
    fprintf (stream, _("\
-  -mamd64                 accept only AMD64 ISA\n"));
+  -mamd64                 accept only AMD64 ISA [default]\n"));
    fprintf (stream, _("\
    -mintel64               accept only Intel64 ISA\n"));
  }
@@ -11209,7 +11919,7 @@ md_section_align (segT segment ATTRIBUTE_UNUSED, valueT size)
          work.  */
        int align;
  
-      align = bfd_get_section_alignment (stdoutput, segment);
+      align = bfd_section_alignment (segment);
        size = ((size + (1 << align) - 1) & (-((valueT) 1 << align)));
      }
  #endif
@@ -11652,8 +12362,7 @@ handle_large_common (int small ATTRIBUTE_UNUSED)
           /* The .lbss section is for local .largecomm symbols.  */
           lbss_section = subseg_new (".lbss", 0);
           applicable = bfd_applicable_section_flags (stdoutput);
-         bfd_set_section_flags (stdoutput, lbss_section,
-                                applicable & SEC_ALLOC);
+         bfd_set_section_flags (lbss_section, applicable & SEC_ALLOC);
           seg_info (lbss_section)->bss = 1;
  
           subseg_set (seg, subseg);