Remove netware support
gas/config/tc-i386.c
index cd69321bdb25f698f7764e5648d10253472724a3..9d5293d23b42562db0227226dcb3cf3cc421d4a0 100644
@@ -81,9 +81,6 @@
 #define SHORT_MNEM_SUFFIX 's'
 #define LONG_MNEM_SUFFIX  'l'
 #define QWORD_MNEM_SUFFIX  'q'
-#define XMMWORD_MNEM_SUFFIX  'x'
-#define YMMWORD_MNEM_SUFFIX 'y'
-#define ZMMWORD_MNEM_SUFFIX 'z'
 /* Intel Syntax.  Use a non-ascii letter since it never appears
    in instructions.  */
 #define LONG_DOUBLE_MNEM_SUFFIX '\1'
@@ -228,7 +225,7 @@ static struct Mask_Operation mask_op;
    broadcast factor.  */
 struct Broadcast_Operation
 {
-  /* Type of broadcast: no broadcast, {1to8}, or {1to16}.  */
+  /* Type of broadcast: {1to2}, {1to4}, {1to8}, or {1to16}.  */
   int type;
 
   /* Index of broadcasted operand.  */
@@ -265,7 +262,6 @@ enum i386_error
     number_of_operands_mismatch,
     invalid_instruction_suffix,
     bad_imm4,
-    old_gcc_only,
     unsupported_with_intel_mnemonic,
     unsupported_syntax,
     unsupported,
@@ -369,6 +365,12 @@ struct _i386_insn
        disp_encoding_32bit
       } disp_encoding;
 
+    /* Prefer the REX byte in encoding.  */
+    bfd_boolean rex_encoding;
+
+    /* Disable instruction size optimization.  */
+    bfd_boolean no_optimize;
+
     /* How to encode vector instructions.  */
     enum
       {
@@ -430,7 +432,6 @@ const char extra_symbol_chars[] = "*%-([{}"
         && !defined (TE_GNU)                           \
         && !defined (TE_LINUX)                         \
         && !defined (TE_NACL)                          \
-        && !defined (TE_NETWARE)                       \
         && !defined (TE_FreeBSD)                       \
         && !defined (TE_DragonFly)                     \
         && !defined (TE_NetBSD)))
@@ -559,9 +560,6 @@ static int intel64;
    0 if att mnemonic.  */
 static int intel_mnemonic = !SYSV386_COMPAT;
 
-/* 1 if support old (<= 2.8.1) versions of gcc.  */
-static int old_gcc = OLDGCC_COMPAT;
-
 /* 1 if pseudo registers are permitted.  */
 static int allow_pseudo_reg = 0;
 
@@ -597,6 +595,22 @@ static enum check_kind
   }
 sse_check, operand_check = check_warning;
 
+/* Optimization:
+   1. Clear the REX_W bit with register operand if possible.
+   2. Above plus use 128bit vector instruction to clear the full vector
+      register.
+ */
+static int optimize = 0;
+
+/* Optimization:
+   1. Clear the REX_W bit with register operand if possible.
+   2. Above plus use 128bit vector instruction to clear the full vector
+      register.
+   3. Above plus optimize "test{q,l,w} $imm8,%r{64,32,16}" to
+      "testb $imm7,%r8".
+ */
+static int optimize_for_space = 0;
+
 /* Register prefix used for error message.  */
 static const char *register_prefix = "%";
 
@@ -1007,6 +1021,12 @@ static const arch_entry cpu_arch[] =
     CPU_VAES_FLAGS, 0 },
   { STRING_COMMA_LEN (".vpclmulqdq"), PROCESSOR_UNKNOWN,
     CPU_VPCLMULQDQ_FLAGS, 0 },
+  { STRING_COMMA_LEN (".wbnoinvd"), PROCESSOR_UNKNOWN,
+    CPU_WBNOINVD_FLAGS, 0 },
+  { STRING_COMMA_LEN (".pconfig"), PROCESSOR_UNKNOWN,
+    CPU_PCONFIG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".waitpkg"), PROCESSOR_UNKNOWN,
+    CPU_WAITPKG_FLAGS, 0 },
 };
 
 static const noarch_entry cpu_noarch[] =
@@ -1123,7 +1143,7 @@ const pseudo_typeS md_pseudo_table[] =
 #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
   {"largecomm", handle_large_common, 0},
 #else
-  {"file", (void (*) (int)) dwarf2_directive_file, 0},
+  {"file", dwarf2_directive_file, 0},
   {"loc", dwarf2_directive_loc, 0},
   {"loc_mark_labels", dwarf2_directive_loc_mark_labels, 0},
 #endif
@@ -1142,108 +1162,146 @@ static struct hash_control *op_hash;
 /* Hash table for register lookup.  */
 static struct hash_control *reg_hash;
 \f
-void
-i386_align_code (fragS *fragP, int count)
-{
   /* Various efficient no-op patterns for aligning code labels.
      Note: Don't try to assemble the instructions in the comments.
      0L and 0w are not legal.  */
-  static const unsigned char f32_1[] =
-    {0x90};                                    /* nop                  */
-  static const unsigned char f32_2[] =
-    {0x66,0x90};                               /* xchg %ax,%ax */
-  static const unsigned char f32_3[] =
-    {0x8d,0x76,0x00};                          /* leal 0(%esi),%esi    */
-  static const unsigned char f32_4[] =
-    {0x8d,0x74,0x26,0x00};                     /* leal 0(%esi,1),%esi  */
-  static const unsigned char f32_5[] =
-    {0x90,                                     /* nop                  */
-     0x8d,0x74,0x26,0x00};                     /* leal 0(%esi,1),%esi  */
-  static const unsigned char f32_6[] =
-    {0x8d,0xb6,0x00,0x00,0x00,0x00};           /* leal 0L(%esi),%esi   */
-  static const unsigned char f32_7[] =
-    {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};      /* leal 0L(%esi,1),%esi */
-  static const unsigned char f32_8[] =
-    {0x90,                                     /* nop                  */
-     0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};      /* leal 0L(%esi,1),%esi */
-  static const unsigned char f32_9[] =
-    {0x89,0xf6,                                        /* movl %esi,%esi       */
-     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};      /* leal 0L(%edi,1),%edi */
-  static const unsigned char f32_10[] =
-    {0x8d,0x76,0x00,                           /* leal 0(%esi),%esi    */
-     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};      /* leal 0L(%edi,1),%edi */
-  static const unsigned char f32_11[] =
-    {0x8d,0x74,0x26,0x00,                      /* leal 0(%esi,1),%esi  */
-     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};      /* leal 0L(%edi,1),%edi */
-  static const unsigned char f32_12[] =
-    {0x8d,0xb6,0x00,0x00,0x00,0x00,            /* leal 0L(%esi),%esi   */
-     0x8d,0xbf,0x00,0x00,0x00,0x00};           /* leal 0L(%edi),%edi   */
-  static const unsigned char f32_13[] =
-    {0x8d,0xb6,0x00,0x00,0x00,0x00,            /* leal 0L(%esi),%esi   */
-     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};      /* leal 0L(%edi,1),%edi */
-  static const unsigned char f32_14[] =
-    {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00,       /* leal 0L(%esi,1),%esi */
-     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};      /* leal 0L(%edi,1),%edi */
-  static const unsigned char f16_3[] =
-    {0x8d,0x74,0x00};                          /* lea 0(%esi),%esi     */
-  static const unsigned char f16_4[] =
-    {0x8d,0xb4,0x00,0x00};                     /* lea 0w(%si),%si      */
-  static const unsigned char f16_5[] =
-    {0x90,                                     /* nop                  */
-     0x8d,0xb4,0x00,0x00};                     /* lea 0w(%si),%si      */
-  static const unsigned char f16_6[] =
-    {0x89,0xf6,                                        /* mov %si,%si          */
-     0x8d,0xbd,0x00,0x00};                     /* lea 0w(%di),%di      */
-  static const unsigned char f16_7[] =
-    {0x8d,0x74,0x00,                           /* lea 0(%si),%si       */
-     0x8d,0xbd,0x00,0x00};                     /* lea 0w(%di),%di      */
-  static const unsigned char f16_8[] =
-    {0x8d,0xb4,0x00,0x00,                      /* lea 0w(%si),%si      */
-     0x8d,0xbd,0x00,0x00};                     /* lea 0w(%di),%di      */
-  static const unsigned char jump_31[] =
-    {0xeb,0x1d,0x90,0x90,0x90,0x90,0x90,       /* jmp .+31; lotsa nops */
-     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,
-     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,
-     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
-  static const unsigned char *const f32_patt[] = {
-    f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
-    f32_9, f32_10, f32_11, f32_12, f32_13, f32_14
-  };
-  static const unsigned char *const f16_patt[] = {
-    f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8
-  };
-  /* nopl (%[re]ax) */
-  static const unsigned char alt_3[] =
-    {0x0f,0x1f,0x00};
-  /* nopl 0(%[re]ax) */
-  static const unsigned char alt_4[] =
-    {0x0f,0x1f,0x40,0x00};
-  /* nopl 0(%[re]ax,%[re]ax,1) */
-  static const unsigned char alt_5[] =
-    {0x0f,0x1f,0x44,0x00,0x00};
-  /* nopw 0(%[re]ax,%[re]ax,1) */
-  static const unsigned char alt_6[] =
-    {0x66,0x0f,0x1f,0x44,0x00,0x00};
-  /* nopl 0L(%[re]ax) */
-  static const unsigned char alt_7[] =
-    {0x0f,0x1f,0x80,0x00,0x00,0x00,0x00};
-  /* nopl 0L(%[re]ax,%[re]ax,1) */
-  static const unsigned char alt_8[] =
-    {0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
-  /* nopw 0L(%[re]ax,%[re]ax,1) */
-  static const unsigned char alt_9[] =
-    {0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
-  /* nopw %cs:0L(%[re]ax,%[re]ax,1) */
-  static const unsigned char alt_10[] =
-    {0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
-  static const unsigned char *const alt_patt[] = {
-    f32_1, f32_2, alt_3, alt_4, alt_5, alt_6, alt_7, alt_8,
-    alt_9, alt_10
-  };
+static const unsigned char f32_1[] =
+  {0x90};                              /* nop                  */
+static const unsigned char f32_2[] =
+  {0x66,0x90};                         /* xchg %ax,%ax         */
+static const unsigned char f32_3[] =
+  {0x8d,0x76,0x00};                    /* leal 0(%esi),%esi    */
+static const unsigned char f32_4[] =
+  {0x8d,0x74,0x26,0x00};               /* leal 0(%esi,1),%esi  */
+static const unsigned char f32_6[] =
+  {0x8d,0xb6,0x00,0x00,0x00,0x00};     /* leal 0L(%esi),%esi   */
+static const unsigned char f32_7[] =
+  {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};        /* leal 0L(%esi,1),%esi */
+static const unsigned char f16_3[] =
+  {0x8d,0x74,0x00};                    /* lea 0(%si),%si       */
+static const unsigned char f16_4[] =
+  {0x8d,0xb4,0x00,0x00};               /* lea 0W(%si),%si      */
+static const unsigned char jump_disp8[] =
+  {0xeb};                              /* jmp disp8           */
+static const unsigned char jump32_disp32[] =
+  {0xe9};                              /* jmp disp32          */
+static const unsigned char jump16_disp32[] =
+  {0x66,0xe9};                         /* jmp disp32          */
+/* 32-bit NOPs patterns.  */
+static const unsigned char *const f32_patt[] = {
+  f32_1, f32_2, f32_3, f32_4, NULL, f32_6, f32_7
+};
+/* 16-bit NOPs patterns.  */
+static const unsigned char *const f16_patt[] = {
+  f32_1, f32_2, f16_3, f16_4
+};
+/* nopl (%[re]ax) */
+static const unsigned char alt_3[] =
+  {0x0f,0x1f,0x00};
+/* nopl 0(%[re]ax) */
+static const unsigned char alt_4[] =
+  {0x0f,0x1f,0x40,0x00};
+/* nopl 0(%[re]ax,%[re]ax,1) */
+static const unsigned char alt_5[] =
+  {0x0f,0x1f,0x44,0x00,0x00};
+/* nopw 0(%[re]ax,%[re]ax,1) */
+static const unsigned char alt_6[] =
+  {0x66,0x0f,0x1f,0x44,0x00,0x00};
+/* nopl 0L(%[re]ax) */
+static const unsigned char alt_7[] =
+  {0x0f,0x1f,0x80,0x00,0x00,0x00,0x00};
+/* nopl 0L(%[re]ax,%[re]ax,1) */
+static const unsigned char alt_8[] =
+  {0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+/* nopw 0L(%[re]ax,%[re]ax,1) */
+static const unsigned char alt_9[] =
+  {0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+/* nopw %cs:0L(%[re]ax,%[re]ax,1) */
+static const unsigned char alt_10[] =
+  {0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+/* data16 nopw %cs:0L(%eax,%eax,1) */
+static const unsigned char alt_11[] =
+  {0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+/* 32-bit and 64-bit NOPs patterns.  */
+static const unsigned char *const alt_patt[] = {
+  f32_1, f32_2, alt_3, alt_4, alt_5, alt_6, alt_7, alt_8,
+  alt_9, alt_10, alt_11
+};
 
-  /* Only align for at least a positive non-zero boundary. */
-  if (count <= 0 || count > MAX_MEM_FOR_RS_ALIGN_CODE)
-    return;
+/* Generate COUNT bytes of NOPs at WHERE from PATT with the maximum
+   size of a single NOP instruction MAX_SINGLE_NOP_SIZE.  */
+
+static void
+i386_output_nops (char *where, const unsigned char *const *patt,
+                 int count, int max_single_nop_size)
+{
+  /* Place the longest NOPs first.  */
+  int last;
+  int offset;
+  const unsigned char *nops = patt[max_single_nop_size - 1];
+
+  /* Use the smaller one if the requested one isn't available.  */
+  if (nops == NULL)
+    {
+      max_single_nop_size--;
+      nops = patt[max_single_nop_size - 1];
+    }
+
+  last = count % max_single_nop_size;
+
+  count -= last;
+  for (offset = 0; offset < count; offset += max_single_nop_size)
+    memcpy (where + offset, nops, max_single_nop_size);
+
+  if (last)
+    {
+      nops = patt[last - 1];
+      if (nops == NULL)
+       {
+         /* Use the smaller one plus one-byte NOP if the needed one
+            isn't available.  */
+         last--;
+         nops = patt[last - 1];
+         memcpy (where + offset, nops, last);
+         where[offset + last] = *patt[0];
+       }
+      else
+       memcpy (where + offset, nops, last);
+    }
+}
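
To see the fill strategy concretely, here is a standalone sketch using a hypothetical 1/2/4-byte pattern table whose 3-byte slot is a hole (f32_patt above has its hole at the 5-byte slot instead); 7 bytes come out as one 4-byte NOP, then a 2-byte NOP plus a 1-byte NOP:

#include <stdio.h>
#include <string.h>

static const unsigned char p1[] = {0x90};                 /* nop */
static const unsigned char p2[] = {0x66,0x90};            /* xchg %ax,%ax */
static const unsigned char p4[] = {0x0f,0x1f,0x40,0x00};  /* nopl 0(%eax) */
static const unsigned char *const patt[] = {p1, p2, NULL, p4};

int
main (void)
{
  unsigned char buf[16];
  int count = 7, max = 4, last, off;
  const unsigned char *nops = patt[max - 1];

  last = count % max;                   /* 7 % 4 == 3 trailing bytes */
  for (off = 0; off + max <= count; off += max)
    memcpy (buf + off, nops, max);      /* one 4-byte NOP */

  if (last)
    {
      nops = patt[last - 1];
      if (nops == NULL)                 /* 3-byte slot is a hole:  */
        {
          last--;                       /* use a 2-byte NOP ...    */
          memcpy (buf + off, patt[last - 1], last);
          buf[off + last] = *patt[0];   /* ... plus a 1-byte NOP   */
        }
      else
        memcpy (buf + off, nops, last);
    }

  for (off = 0; off < count; off++)
    printf ("%02x ", buf[off]);         /* 0f 1f 40 00 66 90 90 */
  putchar ('\n');
  return 0;
}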
+
+static INLINE int
+fits_in_imm7 (offsetT num)
+{
+  return (num & 0x7f) == num;
+}
+
+static INLINE int
+fits_in_imm31 (offsetT num)
+{
+  return (num & 0x7fffffff) == num;
+}
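
Both helpers test an unsigned fit, so negative offsets are rejected as well. A quick sanity check one could drop next to them (assuming offsetT is a 64-bit signed type):

#include <assert.h>

static void
check_imm_fits (void)
{
  assert (fits_in_imm7 (0x7f));
  assert (!fits_in_imm7 (0x80));
  assert (!fits_in_imm7 (-1));                  /* (-1 & 0x7f) != -1  */
  assert (fits_in_imm31 (0x7fffffff));
  assert (!fits_in_imm31 ((offsetT) 1 << 31));  /* bit 31 must be clear */
}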
+
+/* Generate COUNT bytes of NOPs at WHERE with the maximum size of a
+   single NOP instruction LIMIT.  */
+
+void
+i386_generate_nops (fragS *fragP, char *where, offsetT count, int limit)
+{
+  const unsigned char *const *patt = NULL;
+  int max_single_nop_size;
+  /* Maximum number of NOPs before switching to jump over NOPs.  */
+  int max_number_of_nops;
+
+  switch (fragP->fr_type)
+    {
+    case rs_fill_nop:
+    case rs_align_code:
+      break;
+    default:
+      return;
+    }
 
   /* We need to decide which NOP sequence to use for 32bit and
      64bit. When -mtune= is used:
@@ -1261,21 +1319,13 @@ i386_align_code (fragS *fragP, int count)
 
   if (flag_code == CODE_16BIT)
     {
-      if (count > 8)
-       {
-         memcpy (fragP->fr_literal + fragP->fr_fix,
-                 jump_31, count);
-         /* Adjust jump offset.  */
-         fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
-       }
-      else
-       memcpy (fragP->fr_literal + fragP->fr_fix,
-               f16_patt[count - 1], count);
+      patt = f16_patt;
+      max_single_nop_size = sizeof (f16_patt) / sizeof (f16_patt[0]);
+      /* Limit number of NOPs to 2 in 16-bit mode.  */
+      max_number_of_nops = 2;
     }
   else
     {
-      const unsigned char *const *patt = NULL;
-
       if (fragP->tc_frag_data.isa == PROCESSOR_UNKNOWN)
        {
          /* PROCESSOR_UNKNOWN means that all ISAs may be used.  */
@@ -1366,47 +1416,79 @@ i386_align_code (fragS *fragP, int count)
 
       if (patt == f32_patt)
        {
-         /* If the padding is less than 15 bytes, we use the normal
-            ones.  Otherwise, we use a jump instruction and adjust
-            its offset.   */
-         int limit;
+         max_single_nop_size = sizeof (f32_patt) / sizeof (f32_patt[0]);
+         /* Limit number of NOPs to 2 for older processors.  */
+         max_number_of_nops = 2;
+       }
+      else
+       {
+         max_single_nop_size = sizeof (alt_patt) / sizeof (alt_patt[0]);
+         /* Limit number of NOPs to 7 for newer processors.  */
+         max_number_of_nops = 7;
+       }
+    }
 
-         /* For 64bit, the limit is 3 bytes.  */
-         if (flag_code == CODE_64BIT
-             && fragP->tc_frag_data.isa_flags.bitfield.cpulm)
-           limit = 3;
-         else
-           limit = 15;
-         if (count < limit)
-           memcpy (fragP->fr_literal + fragP->fr_fix,
-                   patt[count - 1], count);
-         else
-           {
-             memcpy (fragP->fr_literal + fragP->fr_fix,
-                     jump_31, count);
-             /* Adjust jump offset.  */
-             fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
-           }
+  if (limit == 0)
+    limit = max_single_nop_size;
+
+  if (fragP->fr_type == rs_fill_nop)
+    {
+      /* Output NOPs for .nop directive.  */
+      if (limit > max_single_nop_size)
+       {
+         as_bad_where (fragP->fr_file, fragP->fr_line,
+                       _("invalid single nop size: %d "
+                         "(expect within [0, %d])"),
+                       limit, max_single_nop_size);
+         return;
+       }
+    }
+  else
+    fragP->fr_var = count;
+
+  if ((count / max_single_nop_size) > max_number_of_nops)
+    {
+      /* Generate jump over NOPs.  */
+      offsetT disp = count - 2;
+      if (fits_in_imm7 (disp))
+       {
+         /* Use "jmp disp8" if possible.  */
+         count = disp;
+         where[0] = jump_disp8[0];
+         where[1] = count;
+         where += 2;
        }
       else
        {
-         /* Maximum length of an instruction is 10 byte.  If the
-            padding is greater than 10 bytes and we don't use jump,
-            we have to break it into smaller pieces.  */
-         int padding = count;
-         while (padding > 10)
+         unsigned int size_of_jump;
+
+         if (flag_code == CODE_16BIT)
            {
-             padding -= 10;
-             memcpy (fragP->fr_literal + fragP->fr_fix + padding,
-                     patt [9], 10);
+             where[0] = jump16_disp32[0];
+             where[1] = jump16_disp32[1];
+             size_of_jump = 2;
+           }
+         else
+           {
+             where[0] = jump32_disp32[0];
+             size_of_jump = 1;
            }
 
-         if (padding)
-           memcpy (fragP->fr_literal + fragP->fr_fix,
-                   patt [padding - 1], padding);
+         count -= size_of_jump + 4;
+         if (!fits_in_imm31 (count))
+           {
+             as_bad_where (fragP->fr_file, fragP->fr_line,
+                           _("jump over nop padding out of range"));
+             return;
+           }
+
+         md_number_to_chars (where + size_of_jump, count, 4);
+         where += size_of_jump + 4;
        }
     }
-  fragP->fr_var = count;
+
+  /* Generate multiple NOPs.  */
+  i386_output_nops (where, patt, count, limit);
 }
 
 static INLINE int
@@ -1599,15 +1681,9 @@ cpu_flags_and_not (i386_cpu_flags x, i386_cpu_flags y)
 
 #define CPU_FLAGS_ARCH_MATCH           0x1
 #define CPU_FLAGS_64BIT_MATCH          0x2
-#define CPU_FLAGS_AES_MATCH            0x4
-#define CPU_FLAGS_PCLMUL_MATCH         0x8
-#define CPU_FLAGS_AVX_MATCH           0x10
 
-#define CPU_FLAGS_32BIT_MATCH \
-  (CPU_FLAGS_ARCH_MATCH | CPU_FLAGS_AES_MATCH \
-   | CPU_FLAGS_PCLMUL_MATCH | CPU_FLAGS_AVX_MATCH)
 #define CPU_FLAGS_PERFECT_MATCH \
-  (CPU_FLAGS_32BIT_MATCH | CPU_FLAGS_64BIT_MATCH)
+  (CPU_FLAGS_ARCH_MATCH | CPU_FLAGS_64BIT_MATCH)
 
 /* Return CPU flags match bits. */
 
@@ -1623,55 +1699,42 @@ cpu_flags_match (const insn_template *t)
   if (cpu_flags_all_zero (&x))
     {
       /* This instruction is available on all archs.  */
-      match |= CPU_FLAGS_32BIT_MATCH;
+      match |= CPU_FLAGS_ARCH_MATCH;
     }
   else
     {
       /* This instruction is available only on some archs.  */
       i386_cpu_flags cpu = cpu_arch_flags;
 
+      /* AVX512VL is not a standalone feature: match it, then strip it.  */
+      if (x.bitfield.cpuavx512vl && !cpu.bitfield.cpuavx512vl)
+       return match;
+      x.bitfield.cpuavx512vl = 0;
+
       cpu = cpu_flags_and (x, cpu);
       if (!cpu_flags_all_zero (&cpu))
        {
          if (x.bitfield.cpuavx)
            {
-             /* We only need to check AES/PCLMUL/SSE2AVX with AVX.  */
-             if (cpu.bitfield.cpuavx)
-               {
-                 /* Check SSE2AVX.  */
-                 if (!t->opcode_modifier.sse2avx|| sse2avx)
-                   {
-                     match |= (CPU_FLAGS_ARCH_MATCH
-                               | CPU_FLAGS_AVX_MATCH);
-                     /* Check AES.  */
-                     if (!x.bitfield.cpuaes || cpu.bitfield.cpuaes)
-                       match |= CPU_FLAGS_AES_MATCH;
-                     /* Check PCLMUL.  */
-                     if (!x.bitfield.cpupclmul
-                         || cpu.bitfield.cpupclmul)
-                       match |= CPU_FLAGS_PCLMUL_MATCH;
-                   }
-               }
-             else
+             /* We need to check a few extra flags with AVX.  */
+             if (cpu.bitfield.cpuavx
+                 && (!t->opcode_modifier.sse2avx || sse2avx)
+                 && (!x.bitfield.cpuaes || cpu.bitfield.cpuaes)
+                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
+                 && (!x.bitfield.cpupclmul || cpu.bitfield.cpupclmul))
                match |= CPU_FLAGS_ARCH_MATCH;
            }
-         else if (x.bitfield.cpuavx512vl)
+         else if (x.bitfield.cpuavx512f)
            {
-             /* Match AVX512VL.  */
-             if (cpu.bitfield.cpuavx512vl)
-               {
-                 /* Need another match.  */
-                 cpu.bitfield.cpuavx512vl = 0;
-                 if (!cpu_flags_all_zero (&cpu))
-                   match |= CPU_FLAGS_32BIT_MATCH;
-                 else
-                   match |= CPU_FLAGS_ARCH_MATCH;
-               }
-             else
+             /* We need to check a few extra flags with AVX512F.  */
+             if (cpu.bitfield.cpuavx512f
+                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
+                 && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
+                 && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
                match |= CPU_FLAGS_ARCH_MATCH;
            }
          else
-           match |= CPU_FLAGS_32BIT_MATCH;
+           match |= CPU_FLAGS_ARCH_MATCH;
        }
     }
   return match;
@@ -1697,6 +1760,26 @@ operand_type_and (i386_operand_type x, i386_operand_type y)
   return x;
 }
 
+static INLINE i386_operand_type
+operand_type_and_not (i386_operand_type x, i386_operand_type y)
+{
+  switch (ARRAY_SIZE (x.array))
+    {
+    case 3:
+      x.array [2] &= ~y.array [2];
+      /* Fall through.  */
+    case 2:
+      x.array [1] &= ~y.array [1];
+      /* Fall through.  */
+    case 1:
+      x.array [0] &= ~y.array [0];
+      break;
+    default:
+      abort ();
+    }
+  return x;
+}
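
operand_type_and_not() clears from x every bit set in y; build_modrm_byte() further down uses it to wipe the whole disp group before or-ing in the single width that applies. In miniature, with hypothetical one-word flag values:

#include <assert.h>

/* Miniature of the and-not / or pairing used in build_modrm_byte().
   The bit values here are hypothetical stand-ins.  */
int
main (void)
{
  unsigned int type    = 0x13;  /* say: mem | disp16 | disp8  */
  unsigned int anydisp = 0x0f;  /* say: disp8 .. disp64 bits  */
  unsigned int disp32s = 0x04;  /* say: the one width we keep */

  type = (type & ~anydisp) | disp32s;   /* and_not, then or */
  assert (type == 0x14);                /* mem | disp32s    */
  return 0;
}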
+
 static INLINE i386_operand_type
 operand_type_or (i386_operand_type x, i386_operand_type y)
 {
@@ -1855,10 +1938,13 @@ match_mem_size (const insn_template *t, unsigned int j)
                   && !t->operand_types[j].bitfield.fword)
               /* For scalar opcode templates to allow register and memory
                  operands at the same time, some special casing is needed
-                 here.  */
+                 here.  Also for v{,p}broadcast*, {,v}pmov{s,z}*, and
+                 down-conversion vpmov*.  */
               || ((t->operand_types[j].bitfield.regsimd
                    && !t->opcode_modifier.broadcast
-                   && (t->operand_types[j].bitfield.dword
+                   && (t->operand_types[j].bitfield.byte
+                       || t->operand_types[j].bitfield.word
+                       || t->operand_types[j].bitfield.dword
                        || t->operand_types[j].bitfield.qword))
                   ? (i.types[j].bitfield.xmmword
                      || i.types[j].bitfield.ymmword
@@ -1919,7 +2005,7 @@ operand_size_match (const insn_template *t)
 
   if (match)
     return match;
-  else if (!t->opcode_modifier.d && !t->opcode_modifier.floatd)
+  else if (!t->opcode_modifier.d)
     {
 mismatch:
       i.error = operand_size_mismatch;
@@ -2540,6 +2626,10 @@ set_cpu_arch (int dummy ATTRIBUTE_UNUSED)
                  cpu_arch_flags = flags;
                  cpu_arch_isa_flags = flags;
                }
+             else
+               cpu_arch_isa_flags
+                 = cpu_flags_or (cpu_arch_isa_flags,
+                                 cpu_arch[j].flags);
              (void) restore_line_pointer (e);
              demand_empty_rest_of_line ();
              return;
@@ -3365,6 +3455,14 @@ build_vex_prefix (const insn_template *t)
     }
 }
 
+static INLINE bfd_boolean
+is_evex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.evex
+        || t->opcode_modifier.broadcast || t->opcode_modifier.masking
+        || t->opcode_modifier.staticrounding || t->opcode_modifier.sae;
+}
+
 /* Build the EVEX prefix.  */
 
 static void
@@ -3499,6 +3597,29 @@ build_evex_prefix (void)
       /* Encode the vector length.  */
       unsigned int vec_length;
 
+      if (!i.tm.opcode_modifier.evex
+         || i.tm.opcode_modifier.evex == EVEXDYN)
+       {
+         unsigned int op;
+
+         vec_length = 0;
+         for (op = 0; op < i.tm.operands; ++op)
+           if (i.tm.operand_types[op].bitfield.xmmword
+               + i.tm.operand_types[op].bitfield.ymmword
+               + i.tm.operand_types[op].bitfield.zmmword > 1)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 i.tm.opcode_modifier.evex = EVEX512;
+               else if (i.types[op].bitfield.ymmword)
+                 i.tm.opcode_modifier.evex = EVEX256;
+               else if (i.types[op].bitfield.xmmword)
+                 i.tm.opcode_modifier.evex = EVEX128;
+               else
+                 continue;
+               break;
+             }
+       }
+
       switch (i.tm.opcode_modifier.evex)
        {
        case EVEXLIG: /* LL' is ignored */
@@ -3596,7 +3717,8 @@ bad_register_operand:
   gas_assert (i.imm_operands <= 1
              && (i.operands <= 2
                  || ((i.tm.opcode_modifier.vex
-                      || i.tm.opcode_modifier.evex)
+                      || i.tm.opcode_modifier.vexopcode
+                      || is_evex_encoding (&i.tm))
                      && i.operands <= 4)));
 
   exp = &im_expressions[i.imm_operands++];
@@ -3645,6 +3767,185 @@ check_hle (void)
     }
 }
 
+/* Try the shortest encoding by shortening operand size.  */
+
+static void
+optimize_encoding (void)
+{
+  int j;
+
+  if (optimize_for_space
+      && i.reg_operands == 1
+      && i.imm_operands == 1
+      && !i.types[1].bitfield.byte
+      && i.op[0].imms->X_op == O_constant
+      && fits_in_imm7 (i.op[0].imms->X_add_number)
+      && ((i.tm.base_opcode == 0xa8
+          && i.tm.extension_opcode == None)
+         || (i.tm.base_opcode == 0xf6
+             && i.tm.extension_opcode == 0x0)))
+    {
+      /* Optimize: -Os:
+          test $imm7, %r64/%r32/%r16  -> test $imm7, %r8
+       */
+      unsigned int base_regnum = i.op[1].regs->reg_num;
+      if (flag_code == CODE_64BIT || base_regnum < 4)
+       {
+         i.types[1].bitfield.byte = 1;
+         /* Ignore the suffix.  */
+         i.suffix = 0;
+         if (base_regnum >= 4
+             && !(i.op[1].regs->reg_flags & RegRex))
+           {
+             /* Handle SP, BP, SI and DI registers.  */
+             if (i.types[1].bitfield.word)
+               j = 16;
+             else if (i.types[1].bitfield.dword)
+               j = 32;
+             else
+               j = 48;
+             i.op[1].regs -= j;
+           }
+       }
+    }
+  else if (flag_code == CODE_64BIT
+          && ((i.types[1].bitfield.qword
+               && i.reg_operands == 1
+               && i.imm_operands == 1
+               && i.op[0].imms->X_op == O_constant
+               && ((i.tm.base_opcode == 0xb0
+                    && i.tm.extension_opcode == None
+                    && fits_in_unsigned_long (i.op[0].imms->X_add_number))
+                   || (fits_in_imm31 (i.op[0].imms->X_add_number)
+                       && (((i.tm.base_opcode == 0x24
+                             || i.tm.base_opcode == 0xa8)
+                            && i.tm.extension_opcode == None)
+                           || (i.tm.base_opcode == 0x80
+                               && i.tm.extension_opcode == 0x4)
+                           || ((i.tm.base_opcode == 0xf6
+                                || i.tm.base_opcode == 0xc6)
+                               && i.tm.extension_opcode == 0x0)))))
+              || (i.types[0].bitfield.qword
+                  && ((i.reg_operands == 2
+                       && i.op[0].regs == i.op[1].regs
+                       && ((i.tm.base_opcode == 0x30
+                            || i.tm.base_opcode == 0x28)
+                           && i.tm.extension_opcode == None))
+                      || (i.reg_operands == 1
+                          && i.operands == 1
+                          && i.tm.base_opcode == 0x30
+                          && i.tm.extension_opcode == None)))))
+    {
+      /* Optimize: -O:
+          andq $imm31, %r64   -> andl $imm31, %r32
+          testq $imm31, %r64  -> testl $imm31, %r32
+          xorq %r64, %r64     -> xorl %r32, %r32
+          subq %r64, %r64     -> subl %r32, %r32
+          movq $imm31, %r64   -> movl $imm31, %r32
+          movq $imm32, %r64   -> movl $imm32, %r32
+        */
+      i.tm.opcode_modifier.norex64 = 1;
+      if (i.tm.base_opcode == 0xb0 || i.tm.base_opcode == 0xc6)
+       {
+         /* Handle
+              movq $imm31, %r64   -> movl $imm31, %r32
+              movq $imm32, %r64   -> movl $imm32, %r32
+          */
+         i.tm.operand_types[0].bitfield.imm32 = 1;
+         i.tm.operand_types[0].bitfield.imm32s = 0;
+         i.tm.operand_types[0].bitfield.imm64 = 0;
+         i.types[0].bitfield.imm32 = 1;
+         i.types[0].bitfield.imm32s = 0;
+         i.types[0].bitfield.imm64 = 0;
+         i.types[1].bitfield.dword = 1;
+         i.types[1].bitfield.qword = 0;
+         if (i.tm.base_opcode == 0xc6)
+           {
+             /* Handle
+                  movq $imm31, %r64   -> movl $imm31, %r32
+              */
+             i.tm.base_opcode = 0xb0;
+             i.tm.extension_opcode = None;
+             i.tm.opcode_modifier.shortform = 1;
+             i.tm.opcode_modifier.modrm = 0;
+           }
+       }
+    }
+  else if (optimize > 1
+          && i.reg_operands == 3
+          && i.op[0].regs == i.op[1].regs
+          && !i.types[2].bitfield.xmmword
+          && (i.tm.opcode_modifier.vex
+              || (!i.mask
+                  && !i.rounding
+                  && is_evex_encoding (&i.tm)
+                  && (i.vec_encoding != vex_encoding_evex
+                      || i.tm.cpu_flags.bitfield.cpuavx512vl
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
+          && ((i.tm.base_opcode == 0x55
+               || i.tm.base_opcode == 0x6655
+               || i.tm.base_opcode == 0x66df
+               || i.tm.base_opcode == 0x57
+               || i.tm.base_opcode == 0x6657
+               || i.tm.base_opcode == 0x66ef
+               || i.tm.base_opcode == 0x66f8
+               || i.tm.base_opcode == 0x66f9
+               || i.tm.base_opcode == 0x66fa
+               || i.tm.base_opcode == 0x66fb)
+              && i.tm.extension_opcode == None))
+    {
+      /* Optimize: -O2:
+          VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,
+          vpsubq and vpsubw:
+            EVEX VOP %zmmM, %zmmM, %zmmN
+              -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+            EVEX VOP %ymmM, %ymmM, %ymmN
+              -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+            VEX VOP %ymmM, %ymmM, %ymmN
+              -> VEX VOP %xmmM, %xmmM, %xmmN
+          VOP, one of vpandn and vpxor:
+            VEX VOP %ymmM, %ymmM, %ymmN
+              -> VEX VOP %xmmM, %xmmM, %xmmN
+          VOP, one of vpandnd and vpandnq:
+            EVEX VOP %zmmM, %zmmM, %zmmN
+              -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+            EVEX VOP %ymmM, %ymmM, %ymmN
+              -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+          VOP, one of vpxord and vpxorq:
+            EVEX VOP %zmmM, %zmmM, %zmmN
+              -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+            EVEX VOP %ymmM, %ymmM, %ymmN
+              -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+       */
+      if (is_evex_encoding (&i.tm))
+       {
+         if (i.vec_encoding == vex_encoding_evex)
+           i.tm.opcode_modifier.evex = EVEX128;
+         else
+           {
+             i.tm.opcode_modifier.vex = VEX128;
+             i.tm.opcode_modifier.vexw = VEXW0;
+             i.tm.opcode_modifier.evex = 0;
+           }
+       }
+      else
+       i.tm.opcode_modifier.vex = VEX128;
+
+      if (i.tm.opcode_modifier.vex)
+       for (j = 0; j < 3; j++)
+         {
+           i.types[j].bitfield.xmmword = 1;
+           i.types[j].bitfield.ymmword = 0;
+         }
+    }
+}
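
For scale, the -Os transform at the top of this function trades a 6-byte accumulator test for a 2-byte one whenever the mask fits in 7 bits; both forms set ZF/SF/PF identically for such a mask. Hand-assembled encodings for illustration only, in the same byte-array style as the NOP tables above:

static const unsigned char testq_imm_rax[] =
  {0x48,0xa9,0x40,0x00,0x00,0x00};              /* testq $0x40, %rax */
static const unsigned char testb_imm_al[] =
  {0xa8,0x40};                                  /* testb $0x40, %al  */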
+
 /* This is the guts of the machine-dependent assembler.  LINE points to a
    machine dependent instruction.  This function is supposed to emit
    the frags/bytes it assembles to.  */
@@ -3722,12 +4023,16 @@ md_assemble (char *line)
 
   if (sse_check != check_none
       && !i.tm.opcode_modifier.noavx
+      && !i.tm.cpu_flags.bitfield.cpuavx
       && (i.tm.cpu_flags.bitfield.cpusse
          || i.tm.cpu_flags.bitfield.cpusse2
          || i.tm.cpu_flags.bitfield.cpusse3
          || i.tm.cpu_flags.bitfield.cpussse3
          || i.tm.cpu_flags.bitfield.cpusse4_1
-         || i.tm.cpu_flags.bitfield.cpusse4_2))
+         || i.tm.cpu_flags.bitfield.cpusse4_2
+         || i.tm.cpu_flags.bitfield.cpupclmul
+         || i.tm.cpu_flags.bitfield.cpuaes
+         || i.tm.cpu_flags.bitfield.cpugfni))
     {
       (sse_check == check_warning
        ? as_warn
@@ -3810,6 +4115,9 @@ md_assemble (char *line)
       i.disp_operands = 0;
     }
 
+  if (optimize && !i.no_optimize && i.tm.opcode_modifier.optimize)
+    optimize_encoding ();
+
   if (!process_suffix ())
     return;
 
@@ -3851,7 +4159,8 @@ md_assemble (char *line)
       as_warn (_("translating to `%sp'"), i.tm.name);
     }
 
-  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.evex)
+  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.vexopcode
+      || is_evex_encoding (&i.tm))
     {
       if (flag_code == CODE_16BIT)
        {
@@ -3928,6 +4237,26 @@ md_assemble (char *line)
        }
     }
 
+  if (i.rex == 0 && i.rex_encoding)
+    {
+      /* Check if we can add a REX_OPCODE byte.  Look for an 8-bit
+         operand that uses a legacy register.  If it is a "hi" byte
+        register, don't add the REX_OPCODE byte.  */
+      int x;
+      for (x = 0; x < 2; x++)
+       if (i.types[x].bitfield.reg
+           && i.types[x].bitfield.byte
+           && (i.op[x].regs->reg_flags & RegRex64) == 0
+           && i.op[x].regs->reg_num > 3)
+         {
+           i.rex_encoding = FALSE;
+           break;
+         }
+
+      if (i.rex_encoding)
+       i.rex = REX_OPCODE;
+    }
+
   if (i.rex != 0)
     add_prefix (REX_OPCODE | i.rex);
 
@@ -4040,6 +4369,14 @@ parse_insn (char *line, char *mnemonic)
                  /* {evex} */
                  i.vec_encoding = vex_encoding_evex;
                  break;
+               case 0x7:
+                 /* {rex} */
+                 i.rex_encoding = TRUE;
+                 break;
+               case 0x8:
+                 /* {nooptimize} */
+                 i.no_optimize = TRUE;
+                 break;
                default:
                  abort ();
                }
@@ -4185,34 +4522,26 @@ check_suffix:
     {
       supported |= cpu_flags_match (t);
       if (supported == CPU_FLAGS_PERFECT_MATCH)
-       goto skip;
-    }
+       {
+         if (!cpu_arch_flags.bitfield.cpui386 && (flag_code != CODE_16BIT))
+           as_warn (_("use .code16 to ensure correct addressing mode"));
 
-  if (!(supported & CPU_FLAGS_64BIT_MATCH))
-    {
-      as_bad (flag_code == CODE_64BIT
-             ? _("`%s' is not supported in 64-bit mode")
-             : _("`%s' is only supported in 64-bit mode"),
-             current_templates->start->name);
-      return NULL;
-    }
-  if (supported != CPU_FLAGS_PERFECT_MATCH)
-    {
-      as_bad (_("`%s' is not supported on `%s%s'"),
-             current_templates->start->name,
-             cpu_arch_name ? cpu_arch_name : default_arch,
-             cpu_sub_arch_name ? cpu_sub_arch_name : "");
-      return NULL;
+         return l;
+       }
     }
 
-skip:
-  if (!cpu_arch_flags.bitfield.cpui386
-          && (flag_code != CODE_16BIT))
-    {
-      as_warn (_("use .code16 to ensure correct addressing mode"));
-    }
+  if (!(supported & CPU_FLAGS_64BIT_MATCH))
+    as_bad (flag_code == CODE_64BIT
+           ? _("`%s' is not supported in 64-bit mode")
+           : _("`%s' is only supported in 64-bit mode"),
+           current_templates->start->name);
+  else
+    as_bad (_("`%s' is not supported on `%s%s'"),
+           current_templates->start->name,
+           cpu_arch_name ? cpu_arch_name : default_arch,
+           cpu_sub_arch_name ? cpu_sub_arch_name : "");
 
-  return l;
+  return NULL;
 }
 
 static char *
@@ -4720,42 +5049,60 @@ check_VecOperands (const insn_template *t)
      to the memory operand.  */
   if (i.broadcast)
     {
-      int broadcasted_opnd_size;
+      i386_operand_type type, overlap;
 
       /* Check if specified broadcast is supported in this instruction,
-        and it's applied to memory operand of DWORD or QWORD type,
-        depending on VecESize.  */
-      if (i.broadcast->type != t->opcode_modifier.broadcast
-         || !i.types[i.broadcast->operand].bitfield.mem
-         || (t->opcode_modifier.vecesize == 0
-             && !i.types[i.broadcast->operand].bitfield.dword
-             && !i.types[i.broadcast->operand].bitfield.unspecified)
-         || (t->opcode_modifier.vecesize == 1
-             && !i.types[i.broadcast->operand].bitfield.qword
-             && !i.types[i.broadcast->operand].bitfield.unspecified))
-       goto bad_broadcast;
-
-      broadcasted_opnd_size = t->opcode_modifier.vecesize ? 64 : 32;
-      if (i.broadcast->type == BROADCAST_1TO16)
-       broadcasted_opnd_size <<= 4; /* Broadcast 1to16.  */
-      else if (i.broadcast->type == BROADCAST_1TO8)
-       broadcasted_opnd_size <<= 3; /* Broadcast 1to8.  */
-      else if (i.broadcast->type == BROADCAST_1TO4)
-       broadcasted_opnd_size <<= 2; /* Broadcast 1to4.  */
-      else if (i.broadcast->type == BROADCAST_1TO2)
-       broadcasted_opnd_size <<= 1; /* Broadcast 1to2.  */
-      else
-       goto bad_broadcast;
-
-      if ((broadcasted_opnd_size == 256
-          && !t->operand_types[i.broadcast->operand].bitfield.ymmword)
-         || (broadcasted_opnd_size == 512
-             && !t->operand_types[i.broadcast->operand].bitfield.zmmword))
+        and that it is applied to a memory operand of DWORD or QWORD type.  */
+      op = i.broadcast->operand;
+      if (!t->opcode_modifier.broadcast
+         || !i.types[op].bitfield.mem
+         || (!i.types[op].bitfield.unspecified
+             && (t->operand_types[op].bitfield.dword
+                 ? !i.types[op].bitfield.dword
+                 : !i.types[op].bitfield.qword)))
        {
        bad_broadcast:
          i.error = unsupported_broadcast;
          return 1;
        }
+
+      operand_type_set (&type, 0);
+      switch ((t->operand_types[op].bitfield.dword ? 4 : 8) * i.broadcast->type)
+       {
+       case 8:
+         type.bitfield.qword = 1;
+         break;
+       case 16:
+         type.bitfield.xmmword = 1;
+         break;
+       case 32:
+         type.bitfield.ymmword = 1;
+         break;
+       case 64:
+         type.bitfield.zmmword = 1;
+         break;
+       default:
+         goto bad_broadcast;
+       }
+
+      overlap = operand_type_and (type, t->operand_types[op]);
+      if (operand_type_all_zero (&overlap))
+         goto bad_broadcast;
+
+      if (t->opcode_modifier.checkregsize)
+       {
+         unsigned int j;
+
+         for (j = 0; j < i.operands; ++j)
+           {
+             if (j != op
+                 && !operand_type_register_match(i.types[j],
+                                                 t->operand_types[j],
+                                                 type,
+                                                 t->operand_types[op]))
+               goto bad_broadcast;
+           }
+       }
     }
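
The switch above multiplies the element size by the broadcast factor to find the vector width the template must accept; for example, a dword source with {1to8} must land in a ymmword operand:

#include <stdio.h>

/* Worked instance of the broadcast size computation above.  */
int
main (void)
{
  int element_bytes = 4;        /* dword source element */
  int bcst_type = 8;            /* {1to8}               */

  switch (element_bytes * bcst_type)
    {
    case 8:  puts ("qword");    break;
    case 16: puts ("xmmword");  break;
    case 32: puts ("ymmword");  break;  /* 4 * 8 == 32: this case */
    case 64: puts ("zmmword");  break;
    default: puts ("bad broadcast");
    }
  return 0;
}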
   /* If broadcast is supported in this instruction, we need to check if
      operand of one-element size isn't specified without broadcast.  */
@@ -4767,15 +5114,16 @@ check_VecOperands (const insn_template *t)
          break;
       gas_assert (op < i.operands);
       /* Check size of the memory operand.  */
-      if ((t->opcode_modifier.vecesize == 0
-          && i.types[op].bitfield.dword)
-         || (t->opcode_modifier.vecesize == 1
-             && i.types[op].bitfield.qword))
+      if (t->operand_types[op].bitfield.dword
+         ? i.types[op].bitfield.dword
+         : i.types[op].bitfield.qword)
        {
          i.error = broadcast_needed;
          return 1;
        }
     }
+  else
+    op = MAX_OPERANDS - 1; /* Avoid uninitialized variable warning.  */
 
   /* Check if requested masking is supported.  */
   if (i.mask
@@ -4822,7 +5170,7 @@ check_VecOperands (const insn_template *t)
       && i.disp_encoding != disp_encoding_32bit)
     {
       if (i.broadcast)
-       i.memshift = t->opcode_modifier.vecesize ? 3 : 2;
+       i.memshift = t->operand_types[op].bitfield.dword ? 2 : 3;
       else
        i.memshift = t->opcode_modifier.disp8memshift;
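
The memshift value chosen here feeds the EVEX compressed disp8*N encoding: a displacement can use the one-byte form only if it is a multiple of the element size, and it is stored pre-shifted. A quick check of the arithmetic for the dword-broadcast case (memshift == 2); the range test mirrors what a fits_in_disp8-style helper would do:

#include <stdio.h>

int
main (void)
{
  int memshift = 2;             /* dword broadcast */
  long disp = 64;

  if ((disp & ((1L << memshift) - 1)) == 0      /* multiple of 4?  */
      && disp >> memshift >= -128
      && disp >> memshift <= 127)               /* fits in a byte? */
    printf ("disp8 = %ld\n", disp >> memshift); /* disp8 = 16      */
  else
    puts ("needs disp32");
  return 0;
}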
 
@@ -4853,7 +5201,7 @@ VEX_check_operands (const insn_template *t)
   if (i.vec_encoding == vex_encoding_evex)
     {
       /* This instruction must be encoded with EVEX prefix.  */
-      if (!t->opcode_modifier.evex)
+      if (!is_evex_encoding (t))
        {
          i.error = unsupported;
          return 1;
@@ -4956,11 +5304,6 @@ match_template (char mnem_suffix)
       if (!found_cpu_match)
        continue;
 
-      /* Check old gcc support. */
-      i.error = old_gcc_only;
-      if (!old_gcc && t->opcode_modifier.oldgcc)
-       continue;
-
       /* Check AT&T mnemonic.   */
       i.error = unsupported_with_intel_mnemonic;
       if (intel_mnemonic && t->opcode_modifier.attmnemonic)
@@ -5104,6 +5447,14 @@ match_template (char mnem_suffix)
              && operand_type_equal (&i.types [0], &acc32)
              && operand_type_equal (&i.types [1], &acc32))
            continue;
+         /* xrelease mov %eax, <disp> is another special case. It must not
+            match the accumulator-only encoding of mov.  */
+         if (flag_code != CODE_64BIT
+             && i.hle_prefix
+             && t->base_opcode == 0xa0
+             && i.types[0].bitfield.acc
+             && operand_type_check (i.types[1], anymem))
+           continue;
          /* If we want store form, we reverse direction of operands.  */
          if (i.dir_encoding == dir_encoding_store
              && t->opcode_modifier.d)
@@ -5129,7 +5480,7 @@ match_template (char mnem_suffix)
                                                   operand_types[1])))
            {
              /* Check if other direction is valid ...  */
-             if (!t->opcode_modifier.d && !t->opcode_modifier.floatd)
+             if (!t->opcode_modifier.d)
                continue;
 
 check_reverse:
@@ -5147,14 +5498,14 @@ check_reverse:
                  /* Does not match either direction.  */
                  continue;
                }
-             /* found_reverse_match holds which of D or FloatDR
+             /* found_reverse_match holds which of D or FloatR
                 we've found.  */
-             if (t->opcode_modifier.d)
-               found_reverse_match = Opcode_D;
-             else if (t->opcode_modifier.floatd)
+             if (!t->opcode_modifier.d)
+               found_reverse_match = 0;
+             else if (operand_types[0].bitfield.tbyte)
                found_reverse_match = Opcode_FloatD;
              else
-               found_reverse_match = 0;
+               found_reverse_match = Opcode_D;
              if (t->opcode_modifier.floatr)
                found_reverse_match |= Opcode_FloatR;
            }
@@ -5190,23 +5541,29 @@ check_reverse:
                case 4:
                  if (!operand_type_match (overlap3, i.types[3])
                      || (check_register
-                         && !operand_type_register_match (i.types[2],
-                                                          operand_types[2],
-                                                          i.types[3],
-                                                          operand_types[3])))
+                         && (!operand_type_register_match (i.types[1],
+                                                           operand_types[1],
+                                                           i.types[3],
+                                                           operand_types[3])
+                             || !operand_type_register_match (i.types[2],
+                                                              operand_types[2],
+                                                              i.types[3],
+                                                              operand_types[3]))))
                    continue;
                  /* Fall through.  */
                case 3:
                  /* Here we make use of the fact that there are no
-                    reverse match 3 operand instructions, and all 3
-                    operand instructions only need to be checked for
-                    register consistency between operands 2 and 3.  */
+                    reverse match 3 operand instructions.  */
                  if (!operand_type_match (overlap2, i.types[2])
                      || (check_register
-                         && !operand_type_register_match (i.types[1],
-                                                          operand_types[1],
-                                                          i.types[2],
-                                                          operand_types[2])))
+                         && (!operand_type_register_match (i.types[0],
+                                                           operand_types[0],
+                                                           i.types[2],
+                                                           operand_types[2])
+                             || !operand_type_register_match (i.types[1],
+                                                              operand_types[1],
+                                                              i.types[2],
+                                                              operand_types[2]))))
                    continue;
                  break;
                }
@@ -5257,9 +5614,6 @@ check_reverse:
        case bad_imm4:
          err_msg = _("constant doesn't fit in 4 bits");
          break;
-       case old_gcc_only:
-         err_msg = _("only supported with old gcc");
-         break;
        case unsupported_with_intel_mnemonic:
          err_msg = _("unsupported with Intel mnemonic");
          break;
@@ -5442,26 +5796,19 @@ process_suffix (void)
                if (!i.tm.operand_types[op].bitfield.inoutportreg
                    && !i.tm.operand_types[op].bitfield.shiftcount)
                  {
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.byte)
-                     {
-                       i.suffix = BYTE_MNEM_SUFFIX;
-                       break;
-                     }
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.word)
-                     {
-                       i.suffix = WORD_MNEM_SUFFIX;
-                       break;
-                     }
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.dword)
-                     {
-                       i.suffix = LONG_MNEM_SUFFIX;
-                       break;
-                     }
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.qword)
-                     {
-                       i.suffix = QWORD_MNEM_SUFFIX;
-                       break;
-                     }
+                   if (!i.types[op].bitfield.reg)
+                     continue;
+                   if (i.types[op].bitfield.byte)
+                     i.suffix = BYTE_MNEM_SUFFIX;
+                   else if (i.types[op].bitfield.word)
+                     i.suffix = WORD_MNEM_SUFFIX;
+                   else if (i.types[op].bitfield.dword)
+                     i.suffix = LONG_MNEM_SUFFIX;
+                   else if (i.types[op].bitfield.qword)
+                     i.suffix = QWORD_MNEM_SUFFIX;
+                   else
+                     continue;
+                   break;
                  }
            }
        }
@@ -5478,7 +5825,9 @@ process_suffix (void)
        {
          if (intel_syntax
              && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_lsuf)
+             && i.tm.opcode_modifier.no_lsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
            i.suffix = 0;
          else if (!check_long_reg ())
            return 0;
@@ -5487,7 +5836,9 @@ process_suffix (void)
        {
          if (intel_syntax
              && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_qsuf)
+             && i.tm.opcode_modifier.no_qsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
            i.suffix = 0;
          else if (!check_qword_reg ())
            return 0;
@@ -5501,13 +5852,6 @@ process_suffix (void)
          else if (!check_word_reg ())
            return 0;
        }
-      else if (i.suffix == XMMWORD_MNEM_SUFFIX
-              || i.suffix == YMMWORD_MNEM_SUFFIX
-              || i.suffix == ZMMWORD_MNEM_SUFFIX)
-       {
-         /* Skip if the instruction has x/y/z suffix.  match_template
-            should check if it is a valid suffix.  */
-       }
       else if (intel_syntax && i.tm.opcode_modifier.ignoresize)
        /* Do nothing if the instruction is going to ignore the prefix.  */
        ;
@@ -5588,15 +5932,19 @@ process_suffix (void)
        }
     }
 
-  /* Change the opcode based on the operand size given by i.suffix;
-     We don't need to change things for byte insns.  */
-
-  if (i.suffix
-      && i.suffix != BYTE_MNEM_SUFFIX
-      && i.suffix != XMMWORD_MNEM_SUFFIX
-      && i.suffix != YMMWORD_MNEM_SUFFIX
-      && i.suffix != ZMMWORD_MNEM_SUFFIX)
+  /* Change the opcode based on the operand size given by i.suffix.  */
+  switch (i.suffix)
     {
+    /* Size floating point instruction.  */
+    case LONG_MNEM_SUFFIX:
+      if (i.tm.opcode_modifier.floatmf)
+       {
+         i.tm.base_opcode ^= 4;
+         break;
+       }
+    /* fall through */
+    case WORD_MNEM_SUFFIX:
+    case QWORD_MNEM_SUFFIX:
       /* It's not a byte, select word/dword operation.  */
       if (i.tm.opcode_modifier.w)
        {
@@ -5605,7 +5953,8 @@ process_suffix (void)
          else
            i.tm.base_opcode |= 1;
        }
-
+    /* fall through */
+    case SHORT_MNEM_SUFFIX:
       /* Now select between word & dword operations via the operand
         size prefix, except for instructions that will ignore this
         prefix anyway.  */
@@ -5621,7 +5970,6 @@ process_suffix (void)
              return 0;
        }
       else if (i.suffix != QWORD_MNEM_SUFFIX
-              && i.suffix != LONG_DOUBLE_MNEM_SUFFIX
               && !i.tm.opcode_modifier.ignoresize
               && !i.tm.opcode_modifier.floatmf
               && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
@@ -5640,27 +5988,17 @@ process_suffix (void)
       /* Set mode64 for an operand.  */
       if (i.suffix == QWORD_MNEM_SUFFIX
          && flag_code == CODE_64BIT
-         && !i.tm.opcode_modifier.norex64)
-       {
+         && !i.tm.opcode_modifier.norex64
          /* Special case for xchg %rax,%rax.  It is NOP and doesn't
-            need rex64.  cmpxchg8b is also a special case. */
-         if (! (i.operands == 2
-                && i.tm.base_opcode == 0x90
-                && i.tm.extension_opcode == None
-                && operand_type_equal (&i.types [0], &acc64)
-                && operand_type_equal (&i.types [1], &acc64))
-             && ! (i.operands == 1
-                   && i.tm.base_opcode == 0xfc7
-                   && i.tm.extension_opcode == 1
-                   && !operand_type_check (i.types [0], reg)
-                   && operand_type_check (i.types [0], anymem)))
-           i.rex |= REX_W;
-       }
-
-      /* Size floating point instruction.  */
-      if (i.suffix == LONG_MNEM_SUFFIX)
-       if (i.tm.opcode_modifier.floatmf)
-         i.tm.base_opcode ^= 4;
+            need rex64. */
+         && ! (i.operands == 2
+               && i.tm.base_opcode == 0x90
+               && i.tm.extension_opcode == None
+               && operand_type_equal (&i.types [0], &acc64)
+               && operand_type_equal (&i.types [1], &acc64)))
+       i.rex |= REX_W;
+
+      break;
     }
 
   return 1;
@@ -6097,20 +6435,21 @@ duplicate:
     }
   else if (i.tm.opcode_modifier.implicitquadgroup)
     {
+      unsigned int regnum, first_reg_in_group, last_reg_in_group;
+
       /* The second operand must be {x,y,z}mmN, where N is a multiple of 4. */
       gas_assert (i.operands >= 2 && i.types[1].bitfield.regsimd);
-      unsigned int regnum = register_number (i.op[1].regs);
-      unsigned int first_reg_in_group = regnum & ~3;
-      unsigned int last_reg_in_group = first_reg_in_group + 3;
-      if (regnum != first_reg_in_group) {
-        as_warn (_("the second source register `%s%s' implicitly denotes"
-            " `%s%.3s%d' to `%s%.3s%d' source group in `%s'"),
-            register_prefix, i.op[1].regs->reg_name,
-            register_prefix, i.op[1].regs->reg_name, first_reg_in_group,
-            register_prefix, i.op[1].regs->reg_name, last_reg_in_group,
-            i.tm.name);
-      }
-       }
+      regnum = register_number (i.op[1].regs);
+      first_reg_in_group = regnum & ~3;
+      last_reg_in_group = first_reg_in_group + 3;
+      if (regnum != first_reg_in_group)
+       as_warn (_("source register `%s%s' implicitly denotes"
+                  " `%s%.3s%u' to `%s%.3s%u' source group in `%s'"),
+                register_prefix, i.op[1].regs->reg_name,
+                register_prefix, i.op[1].regs->reg_name, first_reg_in_group,
+                register_prefix, i.op[1].regs->reg_name, last_reg_in_group,
+                i.tm.name);
+    }
   else if (i.tm.opcode_modifier.regkludge)
     {
       /* The imul $imm, %reg instruction is converted into
@@ -6405,7 +6744,7 @@ build_modrm_byte (void)
            }
          break;
        case 5:
-         if (i.tm.opcode_modifier.evex)
+         if (is_evex_encoding (&i.tm))
            {
              /* For EVEX instructions, when there are 5 operands, the
                 first one must be immediate operand.  If the second one
@@ -6570,6 +6909,8 @@ build_modrm_byte (void)
                fake_zero_displacement = 1;
              if (i.index_reg == 0)
                {
+                 i386_operand_type newdisp;
+
                  gas_assert (!i.tm.opcode_modifier.vecsib);
                  /* Operand is just <disp>  */
                  if (flag_code == CODE_64BIT)
@@ -6581,20 +6922,21 @@ build_modrm_byte (void)
                      i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
                      i.sib.base = NO_BASE_REGISTER;
                      i.sib.index = NO_INDEX_REGISTER;
-                     i.types[op] = ((i.prefix[ADDR_PREFIX] == 0)
-                                    ? disp32s : disp32);
+                     newdisp = (!i.prefix[ADDR_PREFIX] ? disp32s : disp32);
                    }
                  else if ((flag_code == CODE_16BIT)
                           ^ (i.prefix[ADDR_PREFIX] != 0))
                    {
                      i.rm.regmem = NO_BASE_REGISTER_16;
-                     i.types[op] = disp16;
+                     newdisp = disp16;
                    }
                  else
                    {
                      i.rm.regmem = NO_BASE_REGISTER;
-                     i.types[op] = disp32;
+                     newdisp = disp32;
                    }
+                 i.types[op] = operand_type_and_not (i.types[op], anydisp);
+                 i.types[op] = operand_type_or (i.types[op], newdisp);
                }
              else if (!i.tm.opcode_modifier.vecsib)
                {
@@ -6676,14 +7018,18 @@ build_modrm_byte (void)
              if (flag_code == CODE_64BIT
                  && operand_type_check (i.types[op], disp))
                {
-                 i386_operand_type temp;
-                 operand_type_set (&temp, 0);
-                 temp.bitfield.disp8 = i.types[op].bitfield.disp8;
-                 i.types[op] = temp;
+                 i.types[op].bitfield.disp16 = 0;
+                 i.types[op].bitfield.disp64 = 0;
                  if (i.prefix[ADDR_PREFIX] == 0)
-                   i.types[op].bitfield.disp32s = 1;
+                   {
+                     i.types[op].bitfield.disp32 = 0;
+                     i.types[op].bitfield.disp32s = 1;
+                   }
                  else
-                   i.types[op].bitfield.disp32 = 1;
+                   {
+                     i.types[op].bitfield.disp32 = 1;
+                     i.types[op].bitfield.disp32s = 0;
+                   }
                }
 
              if (!i.tm.opcode_modifier.vecsib)
@@ -6876,9 +7222,10 @@ build_modrm_byte (void)
                }
              else
                {
-                 /* There are only 2 operands.  */
-                 gas_assert (op < 2 && i.operands == 2);
-                 vex_reg = 1;
+                 /* There are only 2 non-immediate operands.  */
+                 gas_assert (op < i.imm_operands + 2
+                             && i.operands == i.imm_operands + 2);
+                 vex_reg = i.imm_operands + 1;
                }
            }
          else
@@ -7019,12 +7366,46 @@ output_branch (void)
   frag_var (rs_machine_dependent, 5, i.reloc[0], subtype, sym, off, p);
 }
 
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+/* Return TRUE iff PLT32 relocation should be used for branching to
+   symbol S.  */
+
+static bfd_boolean
+need_plt32_p (symbolS *s)
+{
+  /* PLT32 relocation is ELF only.  */
+  if (!IS_ELF)
+    return FALSE;
+
+  /* Since there is no need to prepare for a PLT branch on x86-64,
+     we can generate R_X86_64_PLT32 instead of R_X86_64_PC32, to be
+     used as a marker for 32-bit PC-relative branches.  */
+  if (!object_64bit)
+    return FALSE;
+
+  /* Weak or undefined symbols need PLT32 relocation.  */
+  if (S_IS_WEAK (s) || !S_IS_DEFINED (s))
+    return TRUE;
+
+  /* A non-global symbol doesn't need PLT32 relocation.  */
+  if (! S_IS_EXTERNAL (s))
+    return FALSE;
+
+  /* Other global symbols need PLT32 relocation.  NB: Symbols with
+     non-default visibility are treated as normal global symbols so
+     that PLT32 relocation can be used as a marker for 32-bit
+     PC-relative branches.  This is useful for linker relaxation.  */
+  return TRUE;
+}
+#endif
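A stand-alone model of this predicate, with a plain struct standing in for gas's symbolS and the S_IS_* accessors (the field names here are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct sym
{
  bool elf64;     /* 64-bit ELF object  */
  bool weak;      /* S_IS_WEAK          */
  bool defined;   /* S_IS_DEFINED       */
  bool external;  /* S_IS_EXTERNAL      */
};

static bool
model_need_plt32_p (const struct sym *s)
{
  if (!s->elf64)
    return false;          /* PLT32 is 64-bit ELF only      */
  if (s->weak || !s->defined)
    return true;           /* may resolve to another module */
  return s->external;      /* locals never need PLT32       */
}

int
main (void)
{
  struct sym local  = { true, false, true, false };
  struct sym global = { true, false, true, true };
  struct sym undef  = { true, false, false, true };

  /* Prints: local 0, global 1, undefined 1.  */
  printf ("local %d, global %d, undefined %d\n",
          model_need_plt32_p (&local),
          model_need_plt32_p (&global),
          model_need_plt32_p (&undef));
  return 0;
}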
+
 static void
 output_jump (void)
 {
   char *p;
   int size;
   fixS *fixP;
+  bfd_reloc_code_real_type jump_reloc = i.reloc[0];
 
   if (i.tm.opcode_modifier.jumpbyte)
     {
@@ -7092,8 +7473,17 @@ output_jump (void)
       abort ();
     }
 
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+  if (size == 4
+      && jump_reloc == NO_RELOC
+      && need_plt32_p (i.op[0].disps->X_add_symbol))
+    jump_reloc = BFD_RELOC_X86_64_PLT32;
+#endif
+
+  jump_reloc = reloc (size, 1, 1, jump_reloc);
+
   fixP = fix_new_exp (frag_now, p - frag_now->fr_literal, size,
-                     i.op[0].disps, 1, reloc (size, 1, 1, i.reloc[0]));
+                     i.op[0].disps, 1, jump_reloc);
 
   /* All jumps handled here are signed, but don't use a signed limit
      check for 32 and 16 bit jumps as we want to allow wrap around at
@@ -8074,15 +8464,15 @@ check_VecOperations (char *op_string, char *op_end)
 
              op_string += 3;
              if (*op_string == '8')
-               bcst_type = BROADCAST_1TO8;
+               bcst_type = 8;
              else if (*op_string == '4')
-               bcst_type = BROADCAST_1TO4;
+               bcst_type = 4;
              else if (*op_string == '2')
-               bcst_type = BROADCAST_1TO2;
+               bcst_type = 2;
              else if (*op_string == '1'
                       && *(op_string+1) == '6')
                {
-                 bcst_type = BROADCAST_1TO16;
+                 bcst_type = 16;
                  op_string++;
                }
              else
@@ -8174,6 +8564,12 @@ check_VecOperations (char *op_string, char *op_end)
              return NULL;
            }
          op_string++;
+
+         /* Strip whitespace since the addition of pseudo prefixes
+            changed how the scrubber treats '{'.  */
+         if (is_space_char (*op_string))
+           ++op_string;
+
          continue;
        }
     unknown_vec_op:
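The new skip matters for input such as "{1to8} {z}", where a single blank may now survive the scrubber between consecutive brace groups. A toy scanner with the same tolerance (hypothetical, not the gas parser):

#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *s = "{1to8} {z}";  /* blank survives after '}'  */
  const char *p = s;
  int groups = 0;

  while (*p == '{')
    {
      p = strchr (p, '}');
      if (p == NULL)
        break;
      p++;             /* step past '}'            */
      if (*p == ' ')
        p++;           /* the new whitespace skip  */
      groups++;
    }
  printf ("parsed %d brace groups from \"%s\"\n", groups, s);
  return 0;
}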
@@ -9311,6 +9707,10 @@ md_estimate_size_before_relax (fragS *fragP, segT segment)
        reloc_type = (enum bfd_reloc_code_real) fragP->fr_var;
       else if (size == 2)
        reloc_type = BFD_RELOC_16_PCREL;
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+      else if (need_plt32_p (fragP->fr_symbol))
+       reloc_type = BFD_RELOC_X86_64_PLT32;
+#endif
       else
        reloc_type = BFD_RELOC_32_PCREL;
 
@@ -9936,9 +10336,9 @@ md_operand (expressionS *e)
 
 \f
 #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
-const char *md_shortopts = "kVQ:sqn";
+const char *md_shortopts = "kVQ:sqnO::";
 #else
-const char *md_shortopts = "qn";
+const char *md_shortopts = "qnO::";
 #endif
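The double colon in "O::" is GNU getopt's spelling for an optional argument: the value must be glued to the option ("-O2", "-Os"), and a bare "-O" leaves optarg NULL. A minimal sketch assuming GNU getopt semantics:

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main (int argc, char **argv)
{
  int c;

  while ((c = getopt (argc, argv, "qnO::")) != -1)
    switch (c)
      {
      case 'O':
        if (optarg == NULL)
          printf ("-O: default optimization\n");
        else if (*optarg == 's')
          printf ("-Os: optimize for space\n");
        else
          printf ("-O%d: numeric level\n", atoi (optarg));
        break;
      default:
        break;
      }
  return 0;
}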
 
 #define OPTION_32 (OPTION_MD_BASE + 0)
@@ -9950,7 +10350,7 @@ const char *md_shortopts = "qn";
 #define OPTION_MSYNTAX (OPTION_MD_BASE + 6)
 #define OPTION_MINDEX_REG (OPTION_MD_BASE + 7)
 #define OPTION_MNAKED_REG (OPTION_MD_BASE + 8)
-#define OPTION_MOLD_GCC (OPTION_MD_BASE + 9)
+#define OPTION_MRELAX_RELOCATIONS (OPTION_MD_BASE + 9)
 #define OPTION_MSSE2AVX (OPTION_MD_BASE + 10)
 #define OPTION_MSSE_CHECK (OPTION_MD_BASE + 11)
 #define OPTION_MOPERAND_CHECK (OPTION_MD_BASE + 12)
@@ -9966,7 +10366,6 @@ const char *md_shortopts = "qn";
 #define OPTION_MAMD64 (OPTION_MD_BASE + 22)
 #define OPTION_MINTEL64 (OPTION_MD_BASE + 23)
 #define OPTION_MFENCE_AS_LOCK_ADD (OPTION_MD_BASE + 24)
-#define OPTION_MRELAX_RELOCATIONS (OPTION_MD_BASE + 25)
 
 struct option md_longopts[] =
 {
@@ -9986,7 +10385,6 @@ struct option md_longopts[] =
   {"msyntax", required_argument, NULL, OPTION_MSYNTAX},
   {"mindex-reg", no_argument, NULL, OPTION_MINDEX_REG},
   {"mnaked-reg", no_argument, NULL, OPTION_MNAKED_REG},
-  {"mold-gcc", no_argument, NULL, OPTION_MOLD_GCC},
   {"msse2avx", no_argument, NULL, OPTION_MSSE2AVX},
   {"msse-check", required_argument, NULL, OPTION_MSSE_CHECK},
   {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
@@ -10171,6 +10569,10 @@ md_parse_option (int c, const char *arg)
                      cpu_arch_flags = flags;
                      cpu_arch_isa_flags = flags;
                    }
+                 else
+                   cpu_arch_isa_flags
+                     = cpu_flags_or (cpu_arch_isa_flags,
+                                     cpu_arch[j].flags);
                  break;
                }
            }
@@ -10258,10 +10660,6 @@ md_parse_option (int c, const char *arg)
       allow_naked_reg = 1;
       break;
 
-    case OPTION_MOLD_GCC:
-      old_gcc = 1;
-      break;
-
     case OPTION_MSSE2AVX:
       sse2avx = 1;
       break;
@@ -10375,6 +10773,27 @@ md_parse_option (int c, const char *arg)
       intel64 = 1;
       break;
 
+    case 'O':
+      if (arg == NULL)
+       {
+         optimize = 1;
+         /* Turn off -Os.  */
+         optimize_for_space = 0;
+       }
+      else if (*arg == 's')
+       {
+         optimize_for_space = 1;
+         /* Turn on all encoding optimizations.  */
+         optimize = -1;
+       }
+      else
+       {
+         optimize = atoi (arg);
+         /* Turn off -Os.  */
+         optimize_for_space = 0;
+       }
+      break;
+
     default:
       return 0;
     }
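For reference, the flag state the new 'O' case produces for representative spellings (a sketch mirroring the case above; the helper name is hypothetical):

#include <stdio.h>
#include <stdlib.h>

static void
set_optimize (const char *arg, int *opt, int *opt_space)
{
  if (arg == NULL)
    {
      *opt = 1;           /* plain -O                         */
      *opt_space = 0;
    }
  else if (*arg == 's')
    {
      *opt_space = 1;     /* -Os: also turns on all encoding  */
      *opt = -1;          /* optimizations                    */
    }
  else
    {
      *opt = atoi (arg);  /* -O<n>                            */
      *opt_space = 0;
    }
}

int
main (void)
{
  const char *args[] = { NULL, "s", "2" };
  size_t i;

  for (i = 0; i < 3; i++)
    {
      int opt = 0, opt_space = 0;

      set_optimize (args[i], &opt, &opt_space);
      printf ("-O%s -> optimize=%d optimize_for_space=%d\n",
              args[i] ? args[i] : "", opt, opt_space);
    }
  return 0;
}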
@@ -10551,8 +10970,6 @@ md_show_usage (FILE *stream)
   fprintf (stream, _("\
   -mnaked-reg             don't require `%%' prefix for registers\n"));
   fprintf (stream, _("\
-  -mold-gcc               support old (<= 2.8.1) versions of gcc\n"));
-  fprintf (stream, _("\
   -madd-bnd-prefix        add BND prefix for all valid branches\n"));
   fprintf (stream, _("\
   -mshared                disable branch optimization for shared code\n"));