x86: Determine vector length from the last vector operand
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index cc1071517f4ae27f60aa168015ef33a456eb61bf..575b017af42a9f6c16f46470495a096a7dbef034 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -225,7 +225,7 @@ static struct Mask_Operation mask_op;
    broadcast factor.  */
 struct Broadcast_Operation
 {
-  /* Type of broadcast: no broadcast, {1to8}, or {1to16}.  */
+  /* Type of broadcast: {1to2}, {1to4}, {1to8}, or {1to16}.  */
   int type;
 
   /* Index of broadcasted operand.  */
@@ -262,7 +262,6 @@ enum i386_error
     number_of_operands_mismatch,
     invalid_instruction_suffix,
     bad_imm4,
-    old_gcc_only,
     unsupported_with_intel_mnemonic,
     unsupported_syntax,
     unsupported,
@@ -433,7 +432,6 @@ const char extra_symbol_chars[] = "*%-([{}"
         && !defined (TE_GNU)                           \
         && !defined (TE_LINUX)                         \
         && !defined (TE_NACL)                          \
-        && !defined (TE_NETWARE)                       \
         && !defined (TE_FreeBSD)                       \
         && !defined (TE_DragonFly)                     \
         && !defined (TE_NetBSD)))
@@ -562,9 +560,6 @@ static int intel64;
    0 if att mnemonic.  */
 static int intel_mnemonic = !SYSV386_COMPAT;
 
-/* 1 if support old (<= 2.8.1) versions of gcc.  */
-static int old_gcc = OLDGCC_COMPAT;
-
 /* 1 if pseudo registers are permitted.  */
 static int allow_pseudo_reg = 0;
 
@@ -848,6 +843,8 @@ static const arch_entry cpu_arch[] =
     CPU_BDVER4_FLAGS, 0 },
   { STRING_COMMA_LEN ("znver1"), PROCESSOR_ZNVER,
     CPU_ZNVER1_FLAGS, 0 },
+  { STRING_COMMA_LEN ("znver2"), PROCESSOR_ZNVER,
+    CPU_ZNVER2_FLAGS, 0 },
   { STRING_COMMA_LEN ("btver1"), PROCESSOR_BT,
     CPU_BTVER1_FLAGS, 0 },
   { STRING_COMMA_LEN ("btver2"), PROCESSOR_BT,
@@ -1030,6 +1027,14 @@ static const arch_entry cpu_arch[] =
     CPU_WBNOINVD_FLAGS, 0 },
   { STRING_COMMA_LEN (".pconfig"), PROCESSOR_UNKNOWN,
     CPU_PCONFIG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".waitpkg"), PROCESSOR_UNKNOWN,
+    CPU_WAITPKG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".cldemote"), PROCESSOR_UNKNOWN,
+    CPU_CLDEMOTE_FLAGS, 0 },
+  { STRING_COMMA_LEN (".movdiri"), PROCESSOR_UNKNOWN,
+    CPU_MOVDIRI_FLAGS, 0 },
+  { STRING_COMMA_LEN (".movdir64b"), PROCESSOR_UNKNOWN,
+    CPU_MOVDIR64B_FLAGS, 0 },
 };
 
 static const noarch_entry cpu_noarch[] =
@@ -1065,6 +1070,8 @@ static const noarch_entry cpu_noarch[] =
   { STRING_COMMA_LEN ("noavx512_bitalg"), CPU_ANY_AVX512_BITALG_FLAGS },
   { STRING_COMMA_LEN ("noibt"), CPU_ANY_IBT_FLAGS },
   { STRING_COMMA_LEN ("noshstk"), CPU_ANY_SHSTK_FLAGS },
+  { STRING_COMMA_LEN ("nomovdiri"), CPU_ANY_MOVDIRI_FLAGS },
+  { STRING_COMMA_LEN ("nomovdir64b"), CPU_ANY_MOVDIR64B_FLAGS },
 };
 
 #ifdef I386COFF
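
The new ".movdiri"/".movdir64b" sub-extension entries and their "no*" counterparts can be driven from .arch directives. A brief sketch, assuming the AT&T operand order movdiri reg, mem (register source, memory destination):

	.arch generic64
	.arch .movdiri
	movdiri	%eax, (%rdx)	# accepted once MOVDIRI is enabled
	.arch .nomovdiri
	# movdiri %eax, (%rdx)	would now be rejected again
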
@@ -1825,11 +1832,6 @@ operand_type_xor (i386_operand_type x, i386_operand_type y)
 
 static const i386_operand_type acc32 = OPERAND_TYPE_ACC32;
 static const i386_operand_type acc64 = OPERAND_TYPE_ACC64;
-static const i386_operand_type control = OPERAND_TYPE_CONTROL;
-static const i386_operand_type inoutportreg
-  = OPERAND_TYPE_INOUTPORTREG;
-static const i386_operand_type reg16_inoutportreg
-  = OPERAND_TYPE_REG16_INOUTPORTREG;
 static const i386_operand_type disp16 = OPERAND_TYPE_DISP16;
 static const i386_operand_type disp32 = OPERAND_TYPE_DISP32;
 static const i386_operand_type disp32s = OPERAND_TYPE_DISP32S;
@@ -1895,71 +1897,81 @@ operand_type_check (i386_operand_type t, enum operand_type c)
   return 0;
 }
 
-/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit size
+   between operand GIVEN and operand WANTED for instruction template T.  */
 
 static INLINE int
-match_reg_size (const insn_template *t, unsigned int j)
+match_operand_size (const insn_template *t, unsigned int wanted,
+                   unsigned int given)
 {
-  return !((i.types[j].bitfield.byte
-           && !t->operand_types[j].bitfield.byte)
-          || (i.types[j].bitfield.word
-              && !t->operand_types[j].bitfield.word)
-          || (i.types[j].bitfield.dword
-              && !t->operand_types[j].bitfield.dword)
-          || (i.types[j].bitfield.qword
-              && !t->operand_types[j].bitfield.qword)
-          || (i.types[j].bitfield.tbyte
-              && !t->operand_types[j].bitfield.tbyte));
+  return !((i.types[given].bitfield.byte
+           && !t->operand_types[wanted].bitfield.byte)
+          || (i.types[given].bitfield.word
+              && !t->operand_types[wanted].bitfield.word)
+          || (i.types[given].bitfield.dword
+              && !t->operand_types[wanted].bitfield.dword)
+          || (i.types[given].bitfield.qword
+              && !t->operand_types[wanted].bitfield.qword)
+          || (i.types[given].bitfield.tbyte
+              && !t->operand_types[wanted].bitfield.tbyte));
 }
 
-/* Return 1 if there is no conflict in SIMD register on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in SIMD register between operand
+   GIVEN and operand WANTED for instruction template T.  */
 
 static INLINE int
-match_simd_size (const insn_template *t, unsigned int j)
+match_simd_size (const insn_template *t, unsigned int wanted,
+                unsigned int given)
 {
-  return !((i.types[j].bitfield.xmmword
-           && !t->operand_types[j].bitfield.xmmword)
-          || (i.types[j].bitfield.ymmword
-              && !t->operand_types[j].bitfield.ymmword)
-          || (i.types[j].bitfield.zmmword
-              && !t->operand_types[j].bitfield.zmmword));
+  return !((i.types[given].bitfield.xmmword
+           && !t->operand_types[wanted].bitfield.xmmword)
+          || (i.types[given].bitfield.ymmword
+              && !t->operand_types[wanted].bitfield.ymmword)
+          || (i.types[given].bitfield.zmmword
+              && !t->operand_types[wanted].bitfield.zmmword));
 }
 
-/* Return 1 if there is no conflict in any size on operand J for
-   instruction template T.  */
+/* Return 1 if there is no conflict in any size between operand GIVEN
+   and operand WANTED for instruction template T.  */
 
 static INLINE int
-match_mem_size (const insn_template *t, unsigned int j)
+match_mem_size (const insn_template *t, unsigned int wanted,
+               unsigned int given)
 {
-  return (match_reg_size (t, j)
-         && !((i.types[j].bitfield.unspecified
+  return (match_operand_size (t, wanted, given)
+         && !((i.types[given].bitfield.unspecified
                && !i.broadcast
-               && !t->operand_types[j].bitfield.unspecified)
-              || (i.types[j].bitfield.fword
-                  && !t->operand_types[j].bitfield.fword)
+               && !t->operand_types[wanted].bitfield.unspecified)
+              || (i.types[given].bitfield.fword
+                  && !t->operand_types[wanted].bitfield.fword)
               /* For scalar opcode templates to allow register and memory
                  operands at the same time, some special casing is needed
-                 here.  */
-              || ((t->operand_types[j].bitfield.regsimd
+                 here.  Also for v{,p}broadcast*, {,v}pmov{s,z}*, and
+                 down-conversion vpmov*.  */
+              || ((t->operand_types[wanted].bitfield.regsimd
                    && !t->opcode_modifier.broadcast
-                   && (t->operand_types[j].bitfield.dword
-                       || t->operand_types[j].bitfield.qword))
-                  ? (i.types[j].bitfield.xmmword
-                     || i.types[j].bitfield.ymmword
-                     || i.types[j].bitfield.zmmword)
-                  : !match_simd_size(t, j))));
+                   && (t->operand_types[wanted].bitfield.byte
+                       || t->operand_types[wanted].bitfield.word
+                       || t->operand_types[wanted].bitfield.dword
+                       || t->operand_types[wanted].bitfield.qword))
+                  ? (i.types[given].bitfield.xmmword
+                     || i.types[given].bitfield.ymmword
+                     || i.types[given].bitfield.zmmword)
+                  : !match_simd_size(t, wanted, given))));
 }
 
-/* Return 1 if there is no size conflict on any operands for
-   instruction template T.  */
+/* Return value has MATCH_STRAIGHT set if there is no size conflict on any
+   operands for instruction template T, and it has MATCH_REVERSE set if there
+   is no size conflict on any operands for the template with operands reversed
+   (and the template allows for reversing in the first place).  */
 
-static INLINE int
+#define MATCH_STRAIGHT 1
+#define MATCH_REVERSE  2
+
+static INLINE unsigned int
 operand_size_match (const insn_template *t)
 {
-  unsigned int j;
-  int match = 1;
+  unsigned int j, match = MATCH_STRAIGHT;
 
   /* Don't check jump instructions.  */
   if (t->opcode_modifier.jump
@@ -1976,59 +1988,57 @@ operand_size_match (const insn_template *t)
        continue;
 
       if (t->operand_types[j].bitfield.reg
-         && !match_reg_size (t, j))
+         && !match_operand_size (t, j, j))
        {
          match = 0;
          break;
        }
 
       if (t->operand_types[j].bitfield.regsimd
-         && !match_simd_size (t, j))
+         && !match_simd_size (t, j, j))
        {
          match = 0;
          break;
        }
 
       if (t->operand_types[j].bitfield.acc
-         && (!match_reg_size (t, j) || !match_simd_size (t, j)))
+         && (!match_operand_size (t, j, j) || !match_simd_size (t, j, j)))
        {
          match = 0;
          break;
        }
 
-      if (i.types[j].bitfield.mem && !match_mem_size (t, j))
+      if (i.types[j].bitfield.mem && !match_mem_size (t, j, j))
        {
          match = 0;
          break;
        }
     }
 
-  if (match)
-    return match;
-  else if (!t->opcode_modifier.d)
+  if (!t->opcode_modifier.d)
     {
 mismatch:
-      i.error = operand_size_mismatch;
-      return 0;
+      if (!match)
+       i.error = operand_size_mismatch;
+      return match;
     }
 
   /* Check reverse.  */
   gas_assert (i.operands == 2);
 
-  match = 1;
   for (j = 0; j < 2; j++)
     {
       if ((t->operand_types[j].bitfield.reg
           || t->operand_types[j].bitfield.acc)
-         && !match_reg_size (t, j ? 0 : 1))
+         && !match_operand_size (t, j, !j))
        goto mismatch;
 
-      if (i.types[j].bitfield.mem
-         && !match_mem_size (t, j ? 0 : 1))
+      if (i.types[!j].bitfield.mem
+         && !match_mem_size (t, j, !j))
        goto mismatch;
     }
 
-  return match;
+  return match | MATCH_REVERSE;
 }
 
 static INLINE int
@@ -2321,8 +2331,9 @@ add_prefix (unsigned int prefix)
       && flag_code == CODE_64BIT)
     {
       if ((i.prefix[REX_PREFIX] & prefix & REX_W)
-         || ((i.prefix[REX_PREFIX] & (REX_R | REX_X | REX_B))
-             && (prefix & (REX_R | REX_X | REX_B))))
+         || (i.prefix[REX_PREFIX] & prefix & REX_R)
+         || (i.prefix[REX_PREFIX] & prefix & REX_X)
+         || (i.prefix[REX_PREFIX] & prefix & REX_B))
        ret = PREFIX_EXIST;
       q = REX_PREFIX;
     }
@@ -2626,6 +2637,10 @@ set_cpu_arch (int dummy ATTRIBUTE_UNUSED)
                  cpu_arch_flags = flags;
                  cpu_arch_isa_flags = flags;
                }
+             else
+               cpu_arch_isa_flags
+                 = cpu_flags_or (cpu_arch_isa_flags,
+                                 cpu_arch[j].flags);
              (void) restore_line_pointer (e);
              demand_empty_rest_of_line ();
              return;
@@ -3348,10 +3363,12 @@ build_vex_prefix (const insn_template *t)
     vector_length = 1;
   else
     {
-      unsigned int op;
+      int op;
 
+      /* Determine vector length from the last multi-length vector
+        operand.  */
       vector_length = 0;
-      for (op = 0; op < t->operands; ++op)
+      for (op = t->operands - 1; op >= 0; op--)
        if (t->operand_types[op].bitfield.xmmword
            && t->operand_types[op].bitfield.ymmword
            && i.types[op].bitfield.ymmword)
@@ -3451,6 +3468,14 @@ build_vex_prefix (const insn_template *t)
     }
 }
 
+static INLINE bfd_boolean
+is_evex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.evex || t->opcode_modifier.disp8memshift
+        || t->opcode_modifier.broadcast || t->opcode_modifier.masking
+        || t->opcode_modifier.staticrounding || t->opcode_modifier.sae;
+}
+
 /* Build the EVEX prefix.  */
 
 static void
@@ -3585,6 +3610,59 @@ build_evex_prefix (void)
       /* Encode the vector length.  */
       unsigned int vec_length;
 
+      if (!i.tm.opcode_modifier.evex
+         || i.tm.opcode_modifier.evex == EVEXDYN)
+       {
+         int op;
+
+         /* Determine vector length from the last multi-length vector
+            operand.  */
+         vec_length = 0;
+         for (op = i.operands - 1; op >= 0; op--)
+           if (i.tm.operand_types[op].bitfield.xmmword
+               + i.tm.operand_types[op].bitfield.ymmword
+               + i.tm.operand_types[op].bitfield.zmmword > 1)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 {
+                   i.tm.opcode_modifier.evex = EVEX512;
+                   break;
+                 }
+               else if (i.types[op].bitfield.ymmword)
+                 {
+                   i.tm.opcode_modifier.evex = EVEX256;
+                   break;
+                 }
+               else if (i.types[op].bitfield.xmmword)
+                 {
+                   i.tm.opcode_modifier.evex = EVEX128;
+                   break;
+                 }
+               else if (i.broadcast && (int) op == i.broadcast->operand)
+                 {
+                   switch ((i.tm.operand_types[op].bitfield.dword ? 4 : 8)
+                           * i.broadcast->type)
+                     {
+                       case 64:
+                         i.tm.opcode_modifier.evex = EVEX512;
+                         break;
+                       case 32:
+                         i.tm.opcode_modifier.evex = EVEX256;
+                         break;
+                       case 16:
+                         i.tm.opcode_modifier.evex = EVEX128;
+                         break;
+                       default:
+                         abort ();
+                     }
+                   break;
+                 }
+             }
+
+         if (op < 0)
+           abort ();
+       }
+
       switch (i.tm.opcode_modifier.evex)
        {
        case EVEXLIG: /* LL' is ignored */
@@ -3682,7 +3760,8 @@ bad_register_operand:
   gas_assert (i.imm_operands <= 1
              && (i.operands <= 2
                  || ((i.tm.opcode_modifier.vex
-                      || i.tm.opcode_modifier.evex)
+                      || i.tm.opcode_modifier.vexopcode
+                      || is_evex_encoding (&i.tm))
                      && i.operands <= 4)));
 
   exp = &im_expressions[i.imm_operands++];
@@ -3773,7 +3852,8 @@ optimize_encoding (void)
        }
     }
   else if (flag_code == CODE_64BIT
-          && ((i.reg_operands == 1
+          && ((i.types[1].bitfield.qword
+               && i.reg_operands == 1
                && i.imm_operands == 1
                && i.op[0].imms->X_op == O_constant
                && ((i.tm.base_opcode == 0xb0
@@ -3788,12 +3868,16 @@ optimize_encoding (void)
                            || ((i.tm.base_opcode == 0xf6
                                 || i.tm.base_opcode == 0xc6)
                                && i.tm.extension_opcode == 0x0)))))
-              || (i.reg_operands == 2
-                  && i.op[0].regs == i.op[1].regs
-                  && ((i.tm.base_opcode == 0x30
-                       || i.tm.base_opcode == 0x28)
-                      && i.tm.extension_opcode == None)))
-          && i.types[1].bitfield.qword)
+              || (i.types[0].bitfield.qword
+                  && ((i.reg_operands == 2
+                       && i.op[0].regs == i.op[1].regs
+                       && ((i.tm.base_opcode == 0x30
+                            || i.tm.base_opcode == 0x28)
+                           && i.tm.extension_opcode == None))
+                      || (i.reg_operands == 1
+                          && i.operands == 1
+                          && i.tm.base_opcode == 0x30
+                          && i.tm.extension_opcode == None)))))
     {
       /* Optimize: -O:
           andq $imm31, %r64   -> andl $imm31, %r32
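
A usage sketch of the -O transformations the comment above lists (register choices here are only illustrative):

	andq	$15, %rdx	# emitted as andl $15, %edx under -O
	xorq	%rax, %rax	# emitted as xorl %eax, %eax (31 c0)
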
@@ -3835,10 +3919,14 @@ optimize_encoding (void)
           && i.op[0].regs == i.op[1].regs
           && !i.types[2].bitfield.xmmword
           && (i.tm.opcode_modifier.vex
-              || (!i.mask
+              || ((!i.mask || i.mask->zeroing)
                   && !i.rounding
-                  && i.tm.opcode_modifier.evex
-                  && cpu_arch_flags.bitfield.cpuavx512vl))
+                  && is_evex_encoding (&i.tm)
+                  && (i.vec_encoding != vex_encoding_evex
+                      || i.tm.cpu_flags.bitfield.cpuavx512vl
+                      || (i.tm.operand_types[2].bitfield.zmmword
+                          && i.types[2].bitfield.ymmword)
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
           && ((i.tm.base_opcode == 0x55
                || i.tm.base_opcode == 0x6655
                || i.tm.base_opcode == 0x66df
@@ -3880,15 +3968,9 @@ optimize_encoding (void)
               -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
               -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
        */
-      if (i.tm.opcode_modifier.evex)
+      if (is_evex_encoding (&i.tm))
        {
-         /* If only lower 16 vector registers are used, we can use
-            VEX encoding.  */
-         for (j = 0; j < 3; j++)
-           if (register_number (i.op[j].regs) > 15)
-             break;
-
-         if (j < 3)
+         if (i.vec_encoding == vex_encoding_evex)
            i.tm.opcode_modifier.evex = EVEX128;
          else
            {
@@ -4065,10 +4147,16 @@ md_assemble (char *line)
     }
 
   /* Insert BND prefix.  */
-  if (add_bnd_prefix
-      && i.tm.opcode_modifier.bndprefixok
-      && !i.prefix[BND_PREFIX])
-    add_prefix (BND_PREFIX_OPCODE);
+  if (add_bnd_prefix && i.tm.opcode_modifier.bndprefixok)
+    {
+      if (!i.prefix[BND_PREFIX])
+       add_prefix (BND_PREFIX_OPCODE);
+      else if (i.prefix[BND_PREFIX] != BND_PREFIX_OPCODE)
+       {
+         as_warn (_("replacing `rep'/`repe' prefix by `bnd'"));
+         i.prefix[BND_PREFIX] = BND_PREFIX_OPCODE;
+       }
+    }
 
   /* Check string instruction segment overrides.  */
   if (i.tm.opcode_modifier.isstring && i.mem_operands != 0)
@@ -4122,7 +4210,8 @@ md_assemble (char *line)
       as_warn (_("translating to `%sp'"), i.tm.name);
     }
 
-  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.evex)
+  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.vexopcode
+      || is_evex_encoding (&i.tm))
     {
       if (flag_code == CODE_16BIT)
        {
@@ -4926,6 +5015,30 @@ static int
 check_VecOperands (const insn_template *t)
 {
   unsigned int op;
+  i386_cpu_flags cpu;
+  static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
+
+  /* Templates allowing for ZMMword as well as YMMword and/or XMMword for
+     any one operand are implicitly requiring AVX512VL support if the actual
+     operand size is YMMword or XMMword.  Since this function runs after
+     template matching, there's no need to check for YMMword/XMMword in
+     the template.  */
+  cpu = cpu_flags_and (t->cpu_flags, avx512);
+  if (!cpu_flags_all_zero (&cpu)
+      && !t->cpu_flags.bitfield.cpuavx512vl
+      && !cpu_arch_flags.bitfield.cpuavx512vl)
+    {
+      for (op = 0; op < t->operands; ++op)
+       {
+         if (t->operand_types[op].bitfield.zmmword
+             && (i.types[op].bitfield.ymmword
+                 || i.types[op].bitfield.xmmword))
+           {
+             i.error = unsupported;
+             return 1;
+           }
+       }
+    }
 
   /* Without VSIB byte, we can't have a vector register for index.  */
   if (!t->opcode_modifier.vecsib
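
The effect of the new implicit-AVX512VL check, sketched with the {evex} pseudo prefix so the EVEX templates are used, and assuming only AVX512F (not AVX512VL) has been enabled:

	.arch generic64
	.arch .avx512f
	{evex} vaddpd %zmm1, %zmm2, %zmm3	# 512-bit form: plain AVX512F suffices
	# {evex} vaddpd %ymm1, %ymm2, %ymm3	would be rejected, since the 256-bit
	#					form implicitly requires AVX512VL
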
@@ -5011,42 +5124,61 @@ check_VecOperands (const insn_template *t)
      to the memory operand.  */
   if (i.broadcast)
     {
-      int broadcasted_opnd_size;
+      i386_operand_type type, overlap;
 
       /* Check if specified broadcast is supported in this instruction,
-        and it's applied to memory operand of DWORD or QWORD type,
-        depending on VecESize.  */
-      if (i.broadcast->type != t->opcode_modifier.broadcast
-         || !i.types[i.broadcast->operand].bitfield.mem
-         || (t->opcode_modifier.vecesize == 0
-             && !i.types[i.broadcast->operand].bitfield.dword
-             && !i.types[i.broadcast->operand].bitfield.unspecified)
-         || (t->opcode_modifier.vecesize == 1
-             && !i.types[i.broadcast->operand].bitfield.qword
-             && !i.types[i.broadcast->operand].bitfield.unspecified))
-       goto bad_broadcast;
-
-      broadcasted_opnd_size = t->opcode_modifier.vecesize ? 64 : 32;
-      if (i.broadcast->type == BROADCAST_1TO16)
-       broadcasted_opnd_size <<= 4; /* Broadcast 1to16.  */
-      else if (i.broadcast->type == BROADCAST_1TO8)
-       broadcasted_opnd_size <<= 3; /* Broadcast 1to8.  */
-      else if (i.broadcast->type == BROADCAST_1TO4)
-       broadcasted_opnd_size <<= 2; /* Broadcast 1to4.  */
-      else if (i.broadcast->type == BROADCAST_1TO2)
-       broadcasted_opnd_size <<= 1; /* Broadcast 1to2.  */
-      else
-       goto bad_broadcast;
-
-      if ((broadcasted_opnd_size == 256
-          && !t->operand_types[i.broadcast->operand].bitfield.ymmword)
-         || (broadcasted_opnd_size == 512
-             && !t->operand_types[i.broadcast->operand].bitfield.zmmword))
+        and it's applied to memory operand of DWORD or QWORD type.  */
+      op = i.broadcast->operand;
+      if (!t->opcode_modifier.broadcast
+         || !i.types[op].bitfield.mem
+         || (!i.types[op].bitfield.unspecified
+             && (t->operand_types[op].bitfield.dword
+                 ? !i.types[op].bitfield.dword
+                 : !i.types[op].bitfield.qword)))
        {
        bad_broadcast:
          i.error = unsupported_broadcast;
          return 1;
        }
+
+      operand_type_set (&type, 0);
+      switch ((t->operand_types[op].bitfield.dword ? 4 : 8) * i.broadcast->type)
+       {
+       case 8:
+         type.bitfield.qword = 1;
+         break;
+       case 16:
+         type.bitfield.xmmword = 1;
+         break;
+       case 32:
+         type.bitfield.ymmword = 1;
+         break;
+       case 64:
+         type.bitfield.zmmword = 1;
+         break;
+       default:
+         goto bad_broadcast;
+       }
+
+      overlap = operand_type_and (type, t->operand_types[op]);
+      if (operand_type_all_zero (&overlap))
+         goto bad_broadcast;
+
+      if (t->opcode_modifier.checkregsize)
+       {
+         unsigned int j;
+
+         type.bitfield.baseindex = 1;
+         for (j = 0; j < i.operands; ++j)
+           {
+             if (j != op
+                 && !operand_type_register_match(i.types[j],
+                                                 t->operand_types[j],
+                                                 type,
+                                                 t->operand_types[op]))
+               goto bad_broadcast;
+           }
+       }
     }
   /* If broadcast is supported in this instruction, we need to check if
      operand of one-element size isn't specified without broadcast.  */
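
The reworked check derives the broadcasted size from the template's element width and the given factor, and then requires the remaining (register) operands to be compatible with it. A sketch, assuming AVX512F/AVX512VL:

	vaddps	(%rax){1to8}, %ymm1, %ymm2	# 4-byte elements x 8 = 256 bits: OK
	# vaddps (%rax){1to8}, %zmm1, %zmm2	would be rejected; the 512-bit form
	#					needs {1to16}
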
@@ -5058,15 +5190,16 @@ check_VecOperands (const insn_template *t)
          break;
       gas_assert (op < i.operands);
       /* Check size of the memory operand.  */
-      if ((t->opcode_modifier.vecesize == 0
-          && i.types[op].bitfield.dword)
-         || (t->opcode_modifier.vecesize == 1
-             && i.types[op].bitfield.qword))
+      if (t->operand_types[op].bitfield.dword
+         ? i.types[op].bitfield.dword
+         : i.types[op].bitfield.qword)
        {
          i.error = broadcast_needed;
          return 1;
        }
     }
+  else
+    op = MAX_OPERANDS - 1; /* Avoid uninitialized variable warning.  */
 
   /* Check if requested masking is supported.  */
   if (i.mask
@@ -5113,9 +5246,48 @@ check_VecOperands (const insn_template *t)
       && i.disp_encoding != disp_encoding_32bit)
     {
       if (i.broadcast)
-       i.memshift = t->opcode_modifier.vecesize ? 3 : 2;
-      else
+       i.memshift = t->operand_types[op].bitfield.dword ? 2 : 3;
+      else if (t->opcode_modifier.disp8memshift != DISP8_SHIFT_VL)
        i.memshift = t->opcode_modifier.disp8memshift;
+      else
+       {
+         const i386_operand_type *type = NULL;
+
+         i.memshift = 0;
+         for (op = 0; op < i.operands; op++)
+           if (operand_type_check (i.types[op], anymem))
+             {
+               if (t->operand_types[op].bitfield.xmmword
+                   + t->operand_types[op].bitfield.ymmword
+                   + t->operand_types[op].bitfield.zmmword <= 1)
+                 type = &t->operand_types[op];
+               else if (!i.types[op].bitfield.unspecified)
+                 type = &i.types[op];
+             }
+           else if (i.types[op].bitfield.regsimd)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 i.memshift = 6;
+               else if (i.types[op].bitfield.ymmword && i.memshift < 5)
+                 i.memshift = 5;
+               else if (i.types[op].bitfield.xmmword && i.memshift < 4)
+                 i.memshift = 4;
+             }
+
+         if (type)
+           {
+             if (type->bitfield.zmmword)
+               i.memshift = 6;
+             else if (type->bitfield.ymmword)
+               i.memshift = 5;
+             else if (type->bitfield.xmmword)
+               i.memshift = 4;
+           }
+
+         /* For the check in fits_in_disp8().  */
+         if (i.memshift == 0)
+           i.memshift = -1;
+       }
 
       for (op = 0; op < i.operands; op++)
        if (operand_type_check (i.types[op], disp)
@@ -5144,7 +5316,7 @@ VEX_check_operands (const insn_template *t)
   if (i.vec_encoding == vex_encoding_evex)
     {
       /* This instruction must be encoded with EVEX prefix.  */
-      if (!t->opcode_modifier.evex)
+      if (!is_evex_encoding (t))
        {
          i.error = unsupported;
          return 1;
@@ -5192,7 +5364,7 @@ match_template (char mnem_suffix)
   i386_operand_type operand_types [MAX_OPERANDS];
   int addr_prefix_disp;
   unsigned int j;
-  unsigned int found_cpu_match;
+  unsigned int found_cpu_match, size_match;
   unsigned int check_register;
   enum i386_error specific_error = 0;
 
@@ -5204,7 +5376,9 @@ match_template (char mnem_suffix)
   addr_prefix_disp = -1;
 
   memset (&suffix_check, 0, sizeof (suffix_check));
-  if (i.suffix == BYTE_MNEM_SUFFIX)
+  if (intel_syntax && i.broadcast)
+    /* nothing */;
+  else if (i.suffix == BYTE_MNEM_SUFFIX)
     suffix_check.no_bsuf = 1;
   else if (i.suffix == WORD_MNEM_SUFFIX)
     suffix_check.no_wsuf = 1;
@@ -5247,11 +5421,6 @@ match_template (char mnem_suffix)
       if (!found_cpu_match)
        continue;
 
-      /* Check old gcc support. */
-      i.error = old_gcc_only;
-      if (!old_gcc && t->opcode_modifier.oldgcc)
-       continue;
-
       /* Check AT&T mnemonic.   */
       i.error = unsupported_with_intel_mnemonic;
       if (intel_mnemonic && t->opcode_modifier.attmnemonic)
@@ -5284,7 +5453,8 @@ match_template (char mnem_suffix)
          || (t->opcode_modifier.no_ldsuf && mnemsuf_check.no_ldsuf))
        continue;
 
-      if (!operand_size_match (t))
+      size_match = operand_size_match (t);
+      if (!size_match)
        continue;
 
       for (j = 0; j < MAX_OPERANDS; j++)
@@ -5295,6 +5465,7 @@ match_template (char mnem_suffix)
          && flag_code != CODE_64BIT
          && (intel_syntax
              ? (!t->opcode_modifier.ignoresize
+                && !t->opcode_modifier.broadcast
                 && !intel_float_operand (t->name))
              : intel_float_operand (t->name) != 2)
          && ((!operand_types[0].bitfield.regmmx
@@ -5377,7 +5548,15 @@ match_template (char mnem_suffix)
        continue;
 
       /* We check register size if needed.  */
-      check_register = t->opcode_modifier.checkregsize;
+      if (t->opcode_modifier.checkregsize)
+       {
+         check_register = (1 << t->operands) - 1;
+         if (i.broadcast)
+           check_register &= ~(1 << i.broadcast->operand);
+       }
+      else
+       check_register = 0;
+
       overlap0 = operand_type_and (i.types[0], operand_types[0]);
       switch (t->operands)
        {
@@ -5395,6 +5574,16 @@ match_template (char mnem_suffix)
              && operand_type_equal (&i.types [0], &acc32)
              && operand_type_equal (&i.types [1], &acc32))
            continue;
+         /* xrelease mov %eax, <disp> is another special case. It must not
+            match the accumulator-only encoding of mov.  */
+         if (flag_code != CODE_64BIT
+             && i.hle_prefix
+             && t->base_opcode == 0xa0
+             && i.types[0].bitfield.acc
+             && operand_type_check (i.types[1], anymem))
+           continue;
+         if (!(size_match & MATCH_STRAIGHT))
+           goto check_reverse;
          /* If we want store form, we reverse direction of operands.  */
          if (i.dir_encoding == dir_encoding_store
              && t->opcode_modifier.d)
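
A sketch of the case the new check addresses, assuming 32-bit code: the absolute-address store has to pick the ModR/M form of mov rather than the accumulator-only moffs form, because only the former may legitimately carry the HLE prefix.

	.code32
	xrelease mov	%eax, 0x1234
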
@@ -5413,7 +5602,7 @@ match_template (char mnem_suffix)
          overlap1 = operand_type_and (i.types[1], operand_types[1]);
          if (!operand_type_match (overlap0, i.types[0])
              || !operand_type_match (overlap1, i.types[1])
-             || (check_register
+             || ((check_register & 3) == 3
                  && !operand_type_register_match (i.types[0],
                                                   operand_types[0],
                                                   i.types[1],
@@ -5424,6 +5613,8 @@ match_template (char mnem_suffix)
                continue;
 
 check_reverse:
+             if (!(size_match & MATCH_REVERSE))
+               continue;
              /* Try reversing direction of operands.  */
              overlap0 = operand_type_and (i.types[0], operand_types[1]);
              overlap1 = operand_type_and (i.types[1], operand_types[0]);
@@ -5480,26 +5671,32 @@ check_reverse:
                  /* Fall through.  */
                case 4:
                  if (!operand_type_match (overlap3, i.types[3])
-                     || (check_register
+                     || ((check_register & 0xa) == 0xa
+                         && !operand_type_register_match (i.types[1],
+                                                           operand_types[1],
+                                                           i.types[3],
+                                                           operand_types[3]))
+                     || ((check_register & 0xc) == 0xc
                          && !operand_type_register_match (i.types[2],
-                                                          operand_types[2],
-                                                          i.types[3],
-                                                          operand_types[3])))
+                                                           operand_types[2],
+                                                           i.types[3],
+                                                           operand_types[3])))
                    continue;
                  /* Fall through.  */
                case 3:
                  /* Here we make use of the fact that there are no
                     reverse match 3 operand instructions.  */
                  if (!operand_type_match (overlap2, i.types[2])
-                     || (check_register
-                         && (!operand_type_register_match (i.types[0],
+                     || ((check_register & 5) == 5
+                         && !operand_type_register_match (i.types[0],
                                                            operand_types[0],
                                                            i.types[2],
-                                                           operand_types[2])
-                             || !operand_type_register_match (i.types[1],
-                                                              operand_types[1],
-                                                              i.types[2],
-                                                              operand_types[2]))))
+                                                           operand_types[2]))
+                     || ((check_register & 6) == 6
+                         && !operand_type_register_match (i.types[1],
+                                                           operand_types[1],
+                                                           i.types[2],
+                                                           operand_types[2])))
                    continue;
                  break;
                }
@@ -5550,9 +5747,6 @@ check_reverse:
        case bad_imm4:
          err_msg = _("constant doesn't fit in 4 bits");
          break;
-       case old_gcc_only:
-         err_msg = _("only supported with old gcc");
-         break;
        case unsupported_with_intel_mnemonic:
          err_msg = _("unsupported with Intel mnemonic");
          break;
@@ -5764,7 +5958,9 @@ process_suffix (void)
        {
          if (intel_syntax
              && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_lsuf)
+             && i.tm.opcode_modifier.no_lsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
            i.suffix = 0;
          else if (!check_long_reg ())
            return 0;
@@ -5773,7 +5969,9 @@ process_suffix (void)
        {
          if (intel_syntax
              && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_qsuf)
+             && i.tm.opcode_modifier.no_qsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
            i.suffix = 0;
          else if (!check_qword_reg ())
            return 0;
@@ -5893,14 +6091,18 @@ process_suffix (void)
       /* Now select between word & dword operations via the operand
         size prefix, except for instructions that will ignore this
         prefix anyway.  */
-      if (i.tm.opcode_modifier.addrprefixop0)
+      if (i.reg_operands > 0
+         && i.types[0].bitfield.reg
+         && i.tm.opcode_modifier.addrprefixopreg
+         && (i.tm.opcode_modifier.immext
+             || i.operands == 1))
        {
          /* The address size override prefix changes the size of the
             first operand.  */
          if ((flag_code == CODE_32BIT
-              && i.op->regs[0].reg_type.bitfield.word)
+              && i.op[0].regs->reg_type.bitfield.word)
              || (flag_code != CODE_32BIT
-                 && i.op->regs[0].reg_type.bitfield.dword))
+                 && i.op[0].regs->reg_type.bitfield.dword))
            if (!add_prefix (ADDR_PREFIX_OPCODE))
              return 0;
        }
@@ -5936,6 +6138,41 @@ process_suffix (void)
       break;
     }
 
+  if (i.reg_operands != 0
+      && i.operands > 1
+      && i.tm.opcode_modifier.addrprefixopreg
+      && !i.tm.opcode_modifier.immext)
+    {
+      /* Check invalid register operand when the address size override
+        prefix changes the size of register operands.  */
+      unsigned int op;
+      enum { need_word, need_dword, need_qword } need;
+
+      if (flag_code == CODE_32BIT)
+       need = i.prefix[ADDR_PREFIX] ? need_word : need_dword;
+      else
+       {
+         if (i.prefix[ADDR_PREFIX])
+           need = need_dword;
+         else
+           need = flag_code == CODE_64BIT ? need_qword : need_word;
+       }
+
+      for (op = 0; op < i.operands; op++)
+       if (i.types[op].bitfield.reg
+           && ((need == need_word
+                && !i.op[op].regs->reg_type.bitfield.word)
+               || (need == need_dword
+                   && !i.op[op].regs->reg_type.bitfield.dword)
+               || (need == need_qword
+                   && !i.op[op].regs->reg_type.bitfield.qword)))
+         {
+           as_bad (_("invalid register operand size for `%s'"),
+                   i.tm.name);
+           return 0;
+         }
+    }
+
   return 1;
 }
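
The new check matters for insns like movdir64b (added by this patch), where the register operand supplies a destination address and hence must match the effective address size. A sketch, assuming the AT&T form movdir64b mem, reg in 64-bit mode:

	movdir64b	(%rsi), %rdi	# 64-bit addressing with a 64-bit register: OK
	# movdir64b	(%rsi), %edi	would fail with "invalid register operand
	#				size for `movdir64b'"
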
 
@@ -6500,118 +6737,82 @@ build_modrm_byte (void)
   unsigned int source, dest;
   int vex_3_sources;
 
-  /* The first operand of instructions with VEX prefix and 3 sources
-     must be VEX_Imm4.  */
   vex_3_sources = i.tm.opcode_modifier.vexsources == VEX3SOURCES;
   if (vex_3_sources)
     {
       unsigned int nds, reg_slot;
       expressionS *exp;
 
-      if (i.tm.opcode_modifier.veximmext
-          && i.tm.opcode_modifier.immext)
-        {
-          dest = i.operands - 2;
-          gas_assert (dest == 3);
-        }
-      else
-        dest = i.operands - 1;
+      dest = i.operands - 1;
       nds = dest - 1;
 
       /* There are 2 kinds of instructions:
-         1. 5 operands: 4 register operands or 3 register operands
-         plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
-         VexW0 or VexW1.  The destination must be either XMM, YMM or
+        1. 5 operands: 4 register operands or 3 register operands
+        plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
+        VexW0 or VexW1.  The destination must be either XMM, YMM or
         ZMM register.
-         2. 4 operands: 4 register operands or 3 register operands
-         plus 1 memory operand, VexXDS, and VexImmExt  */
+        2. 4 operands: 4 register operands or 3 register operands
+        plus 1 memory operand, with VexXDS.  */
       gas_assert ((i.reg_operands == 4
-                   || (i.reg_operands == 3 && i.mem_operands == 1))
-                  && i.tm.opcode_modifier.vexvvvv == VEXXDS
-                  && (i.tm.opcode_modifier.veximmext
-                      || (i.imm_operands == 1
-                          && i.types[0].bitfield.vec_imm4
-                          && (i.tm.opcode_modifier.vexw == VEXW0
-                              || i.tm.opcode_modifier.vexw == VEXW1)
-                          && i.tm.operand_types[dest].bitfield.regsimd)));
+                  || (i.reg_operands == 3 && i.mem_operands == 1))
+                 && i.tm.opcode_modifier.vexvvvv == VEXXDS
+                 && i.tm.opcode_modifier.vexw
+                 && i.tm.operand_types[dest].bitfield.regsimd);
+
+      /* If VexW1 is set, the first non-immediate operand is the source and
+        the second non-immediate one is encoded in the immediate operand.  */
+      if (i.tm.opcode_modifier.vexw == VEXW1)
+       {
+         source = i.imm_operands;
+         reg_slot = i.imm_operands + 1;
+       }
+      else
+       {
+         source = i.imm_operands + 1;
+         reg_slot = i.imm_operands;
+       }
 
       if (i.imm_operands == 0)
-        {
-          /* When there is no immediate operand, generate an 8bit
-             immediate operand to encode the first operand.  */
-          exp = &im_expressions[i.imm_operands++];
-          i.op[i.operands].imms = exp;
-          i.types[i.operands] = imm8;
-          i.operands++;
-          /* If VexW1 is set, the first operand is the source and
-             the second operand is encoded in the immediate operand.  */
-          if (i.tm.opcode_modifier.vexw == VEXW1)
-            {
-              source = 0;
-              reg_slot = 1;
-            }
-          else
-            {
-              source = 1;
-              reg_slot = 0;
-            }
-
-          /* FMA swaps REG and NDS.  */
-          if (i.tm.cpu_flags.bitfield.cpufma)
-            {
-              unsigned int tmp;
-              tmp = reg_slot;
-              reg_slot = nds;
-              nds = tmp;
-            }
-
-          gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-          exp->X_op = O_constant;
-          exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
+       {
+         /* When there is no immediate operand, generate an 8bit
+            immediate operand to encode the first operand.  */
+         exp = &im_expressions[i.imm_operands++];
+         i.op[i.operands].imms = exp;
+         i.types[i.operands] = imm8;
+         i.operands++;
+
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         exp->X_op = O_constant;
+         exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
          gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
        }
       else
-        {
-          unsigned int imm_slot;
-
-          if (i.tm.opcode_modifier.vexw == VEXW0)
-            {
-              /* If VexW0 is set, the third operand is the source and
-                 the second operand is encoded in the immediate
-                 operand.  */
-              source = 2;
-              reg_slot = 1;
-            }
-          else
-            {
-              /* VexW1 is set, the second operand is the source and
-                 the third operand is encoded in the immediate
-                 operand.  */
-              source = 1;
-              reg_slot = 2;
-            }
-
-          if (i.tm.opcode_modifier.immext)
-            {
-              /* When ImmExt is set, the immediate byte is the last
-                 operand.  */
-              imm_slot = i.operands - 1;
-              source--;
-              reg_slot--;
-            }
-          else
-            {
-              imm_slot = 0;
-
-              /* Turn on Imm8 so that output_imm will generate it.  */
-              i.types[imm_slot].bitfield.imm8 = 1;
-            }
-
-          gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-          i.op[imm_slot].imms->X_add_number
-              |= register_number (i.op[reg_slot].regs) << 4;
+       {
+         unsigned int imm_slot;
+
+         gas_assert (i.imm_operands == 1 && i.types[0].bitfield.vec_imm4);
+
+         if (i.tm.opcode_modifier.immext)
+           {
+             /* When ImmExt is set, the immediate byte is the last
+                operand.  */
+             imm_slot = i.operands - 1;
+             source--;
+             reg_slot--;
+           }
+         else
+           {
+             imm_slot = 0;
+
+             /* Turn on Imm8 so that output_imm will generate it.  */
+             i.types[imm_slot].bitfield.imm8 = 1;
+           }
+
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         i.op[imm_slot].imms->X_add_number
+             |= register_number (i.op[reg_slot].regs) << 4;
          gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
-        }
+       }
 
       gas_assert (i.tm.operand_types[nds].bitfield.regsimd);
       i.vex.register_specifier = i.op[nds].regs;
@@ -6679,7 +6880,7 @@ build_modrm_byte (void)
            }
          break;
        case 5:
-         if (i.tm.opcode_modifier.evex)
+         if (is_evex_encoding (&i.tm))
            {
              /* For EVEX instructions, when there are 5 operands, the
                 first one must be immediate operand.  If the second one
@@ -6779,12 +6980,11 @@ build_modrm_byte (void)
          if ((i.op[source].regs->reg_flags & RegVRex) != 0)
            i.vrex |= REX_R;
        }
-      if (flag_code != CODE_64BIT && (i.rex & (REX_R | REX_B)))
+      if (flag_code != CODE_64BIT && (i.rex & REX_R))
        {
-         if (!i.types[0].bitfield.control
-             && !i.types[1].bitfield.control)
+         if (!i.types[i.tm.operand_types[0].bitfield.regmem].bitfield.control)
            abort ();
-         i.rex &= ~(REX_R | REX_B);
+         i.rex &= ~REX_R;
          add_prefix (LOCK_PREFIX_OPCODE);
        }
     }
@@ -7556,22 +7756,16 @@ output_insn (void)
              if (i.tm.base_opcode & 0xff000000)
                {
                  prefix = (i.tm.base_opcode >> 24) & 0xff;
-                 goto check_prefix;
+                 add_prefix (prefix);
                }
              break;
            case 2:
              if ((i.tm.base_opcode & 0xff0000) != 0)
                {
                  prefix = (i.tm.base_opcode >> 16) & 0xff;
-                 if (i.tm.cpu_flags.bitfield.cpupadlock)
-                   {
-check_prefix:
-                     if (prefix != REPE_PREFIX_OPCODE
-                         || (i.prefix[REP_PREFIX]
-                             != REPE_PREFIX_OPCODE))
-                       add_prefix (prefix);
-                   }
-                 else
+                 if (!i.tm.cpu_flags.bitfield.cpupadlock
+                     || prefix != REPE_PREFIX_OPCODE
+                     || (i.prefix[REP_PREFIX] != REPE_PREFIX_OPCODE))
                    add_prefix (prefix);
                }
              break;
@@ -8399,15 +8593,15 @@ check_VecOperations (char *op_string, char *op_end)
 
              op_string += 3;
              if (*op_string == '8')
-               bcst_type = BROADCAST_1TO8;
+               bcst_type = 8;
              else if (*op_string == '4')
-               bcst_type = BROADCAST_1TO4;
+               bcst_type = 4;
              else if (*op_string == '2')
-               bcst_type = BROADCAST_1TO2;
+               bcst_type = 2;
              else if (*op_string == '1'
                       && *(op_string+1) == '6')
                {
-                 bcst_type = BROADCAST_1TO16;
+                 bcst_type = 16;
                  op_string++;
                }
              else
@@ -8499,6 +8693,12 @@ check_VecOperations (char *op_string, char *op_end)
              return NULL;
            }
          op_string++;
+
+         /* Strip whitespace since the addition of pseudo prefixes
+            changed how the scrubber treats '{'.  */
+         if (is_space_char (*op_string))
+           ++op_string;
+
          continue;
        }
     unknown_vec_op:
@@ -9517,14 +9717,13 @@ i386_att_operand (char *operand_string)
 
       /* Special case for (%dx) while doing input/output op.  */
       if (i.base_reg
-         && operand_type_equal (&i.base_reg->reg_type,
-                                &reg16_inoutportreg)
+         && i.base_reg->reg_type.bitfield.inoutportreg
          && i.index_reg == 0
          && i.log2_scale_factor == 0
          && i.seg[i.mem_operands] == 0
          && !operand_type_check (i.types[this_operand], disp))
        {
-         i.types[this_operand] = inoutportreg;
+         i.types[this_operand] = i.base_reg->reg_type;
          return 1;
        }
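
The (%dx) special case continues to cover the usual port-addressed forms, now propagating the register's full type instead of a fixed one, e.g.:

	inb	(%dx), %al
	outb	%al, (%dx)
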
 
@@ -10072,6 +10271,11 @@ parse_real_register (char *reg_string, char **end_op)
   /* Handle floating point regs, allowing spaces in the (i) part.  */
   if (r == i386_regtab /* %st is first entry of table  */)
     {
+      if (!cpu_arch_flags.bitfield.cpu8087
+         && !cpu_arch_flags.bitfield.cpu287
+         && !cpu_arch_flags.bitfield.cpu387)
+       return (const reg_entry *) NULL;
+
       if (is_space_char (*s))
        ++s;
       if (*s == '(')
@@ -10112,26 +10316,25 @@ parse_real_register (char *reg_string, char **end_op)
       && !cpu_arch_flags.bitfield.cpui386)
     return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.tbyte
-      && !cpu_arch_flags.bitfield.cpu8087
-      && !cpu_arch_flags.bitfield.cpu287
-      && !cpu_arch_flags.bitfield.cpu387)
+  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpummx)
     return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpuregmmx)
-    return (const reg_entry *) NULL;
-
-  if (r->reg_type.bitfield.xmmword && !cpu_arch_flags.bitfield.cpuregxmm)
-    return (const reg_entry *) NULL;
+  if (!cpu_arch_flags.bitfield.cpuavx512f)
+    {
+      if (r->reg_type.bitfield.zmmword || r->reg_type.bitfield.regmask)
+       return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.ymmword && !cpu_arch_flags.bitfield.cpuregymm)
-    return (const reg_entry *) NULL;
+      if (!cpu_arch_flags.bitfield.cpuavx)
+       {
+         if (r->reg_type.bitfield.ymmword)
+           return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.zmmword && !cpu_arch_flags.bitfield.cpuregzmm)
-    return (const reg_entry *) NULL;
+         if (!cpu_arch_flags.bitfield.cpusse && r->reg_type.bitfield.xmmword)
+           return (const reg_entry *) NULL;
+       }
+    }
 
-  if (r->reg_type.bitfield.regmask
-      && !cpu_arch_flags.bitfield.cpuregmask)
+  if (r->reg_type.bitfield.regbnd && !cpu_arch_flags.bitfield.cpumpx)
     return (const reg_entry *) NULL;
 
   /* Don't allow fake index register unless allow_index_reg isn't 0. */
@@ -10139,23 +10342,19 @@ parse_real_register (char *reg_string, char **end_op)
       && (r->reg_num == RegEiz || r->reg_num == RegRiz))
     return (const reg_entry *) NULL;
 
-  /* Upper 16 vector register is only available with VREX in 64bit
-     mode.  */
-  if ((r->reg_flags & RegVRex))
+  /* Upper 16 vector registers are only available with VREX in 64bit
+     mode, and require EVEX encoding.  */
+  if (r->reg_flags & RegVRex)
     {
-      if (i.vec_encoding == vex_encoding_default)
-       i.vec_encoding = vex_encoding_evex;
-
       if (!cpu_arch_flags.bitfield.cpuvrex
-         || i.vec_encoding != vex_encoding_evex
          || flag_code != CODE_64BIT)
        return (const reg_entry *) NULL;
+
+      i.vec_encoding = vex_encoding_evex;
     }
 
-  if (((r->reg_flags & (RegRex64 | RegRex))
-       || r->reg_type.bitfield.qword)
-      && (!cpu_arch_flags.bitfield.cpulm
-         || !operand_type_equal (&r->reg_type, &control))
+  if (((r->reg_flags & (RegRex64 | RegRex)) || r->reg_type.bitfield.qword)
+      && (!cpu_arch_flags.bitfield.cpulm || !r->reg_type.bitfield.control)
       && flag_code != CODE_64BIT)
     return (const reg_entry *) NULL;
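
Per the updated comment, naming one of the upper 16 vector registers requires 64-bit mode and now unconditionally switches the insn to EVEX encoding. A sketch, assuming AVX512VL is available:

	vaddps	%xmm16, %xmm2, %xmm1	# %xmm16 forces EVEX, and the 128-bit
	#				form therefore needs AVX512VL
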
 
@@ -10279,7 +10478,7 @@ const char *md_shortopts = "qnO::";
 #define OPTION_MSYNTAX (OPTION_MD_BASE + 6)
 #define OPTION_MINDEX_REG (OPTION_MD_BASE + 7)
 #define OPTION_MNAKED_REG (OPTION_MD_BASE + 8)
-#define OPTION_MOLD_GCC (OPTION_MD_BASE + 9)
+#define OPTION_MRELAX_RELOCATIONS (OPTION_MD_BASE + 9)
 #define OPTION_MSSE2AVX (OPTION_MD_BASE + 10)
 #define OPTION_MSSE_CHECK (OPTION_MD_BASE + 11)
 #define OPTION_MOPERAND_CHECK (OPTION_MD_BASE + 12)
@@ -10295,7 +10494,6 @@ const char *md_shortopts = "qnO::";
 #define OPTION_MAMD64 (OPTION_MD_BASE + 22)
 #define OPTION_MINTEL64 (OPTION_MD_BASE + 23)
 #define OPTION_MFENCE_AS_LOCK_ADD (OPTION_MD_BASE + 24)
-#define OPTION_MRELAX_RELOCATIONS (OPTION_MD_BASE + 25)
 
 struct option md_longopts[] =
 {
@@ -10315,7 +10513,6 @@ struct option md_longopts[] =
   {"msyntax", required_argument, NULL, OPTION_MSYNTAX},
   {"mindex-reg", no_argument, NULL, OPTION_MINDEX_REG},
   {"mnaked-reg", no_argument, NULL, OPTION_MNAKED_REG},
-  {"mold-gcc", no_argument, NULL, OPTION_MOLD_GCC},
   {"msse2avx", no_argument, NULL, OPTION_MSSE2AVX},
   {"msse-check", required_argument, NULL, OPTION_MSSE_CHECK},
   {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
@@ -10500,6 +10697,10 @@ md_parse_option (int c, const char *arg)
                      cpu_arch_flags = flags;
                      cpu_arch_isa_flags = flags;
                    }
+                 else
+                   cpu_arch_isa_flags
+                     = cpu_flags_or (cpu_arch_isa_flags,
+                                     cpu_arch[j].flags);
                  break;
                }
            }
@@ -10587,10 +10788,6 @@ md_parse_option (int c, const char *arg)
       allow_naked_reg = 1;
       break;
 
-    case OPTION_MOLD_GCC:
-      old_gcc = 1;
-      break;
-
     case OPTION_MSSE2AVX:
       sse2avx = 1;
       break;
@@ -10901,8 +11098,6 @@ md_show_usage (FILE *stream)
   fprintf (stream, _("\
   -mnaked-reg             don't require `%%' prefix for registers\n"));
   fprintf (stream, _("\
-  -mold-gcc               support old (<= 2.8.1) versions of gcc\n"));
-  fprintf (stream, _("\
   -madd-bnd-prefix        add BND prefix for all valid branches\n"));
   fprintf (stream, _("\
   -mshared                disable branch optimization for shared code\n"));