x86: Correct EVEX to 128-bit EVEX optimization

[deliverable/binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index 81643a73ac4aaf9909ac54716f9605c909e83542..3885728de7b9d25567c4cac477fe25f796476fa7 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1,5 +1,5 @@
  /* tc-i386.c -- Assemble code for the Intel 80386
-   Copyright (C) 1989-2018 Free Software Foundation, Inc.
+   Copyright (C) 1989-2019 Free Software Foundation, Inc.
  
     This file is part of GAS, the GNU Assembler.
  
@@ -33,6 +33,17 @@
  #include "elf/x86-64.h"
  #include "opcodes/i386-init.h"
  
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#else
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#ifndef INT_MAX
+#define INT_MAX (int) (((unsigned) (-1)) >> 1)
+#endif
+#endif
+
  #ifndef REGISTER_WARNINGS
  #define REGISTER_WARNINGS 1
  #endif
@@ -188,6 +199,13 @@ static void s_bss (int);
  #endif
  #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
  static void handle_large_common (int small ATTRIBUTE_UNUSED);
+
+/* GNU_PROPERTY_X86_ISA_1_USED.  */
+static unsigned int x86_isa_1_used;
+/* GNU_PROPERTY_X86_FEATURE_2_USED.  */
+static unsigned int x86_feature_2_used;
+/* Generate x86 used ISA and feature properties.  */
+static unsigned int x86_used_note = DEFAULT_X86_USED_NOTE;
  #endif
  
  static const char *default_arch = DEFAULT_ARCH;
@@ -230,6 +248,9 @@ struct Broadcast_Operation
  
    /* Index of broadcasted operand.  */
    int operand;
+
+  /* Number of bytes to broadcast.  */
+  int bytes;
  };
  
  static struct Broadcast_Operation broadcast_op;
@@ -306,6 +327,7 @@ struct _i386_insn
      /* Flags for operands.  */
      unsigned int flags[MAX_OPERANDS];
  #define Operand_PCrel 1
+#define Operand_Mem   2
  
      /* Relocation type for operand */
      enum bfd_reloc_code_real reloc[MAX_OPERANDS];
@@ -328,6 +350,18 @@ struct _i386_insn
      unsigned int prefixes;
      unsigned char prefix[MAX_PREFIXES];
  
+    /* Has MMX register operands.  */
+    bfd_boolean has_regmmx;
+
+    /* Has XMM register operands.  */
+    bfd_boolean has_regxmm;
+
+    /* Has YMM register operands.  */
+    bfd_boolean has_regymm;
+
+    /* Has ZMM register operands.  */
+    bfd_boolean has_regzmm;
+
      /* RM and SIB are the modrm byte and the sib byte where the
         addressing modes of this insn are encoded.  */
      modrm_byte rm;
@@ -353,7 +387,8 @@ struct _i386_insn
        {
         dir_encoding_default = 0,
         dir_encoding_load,
-       dir_encoding_store
+       dir_encoding_store,
+       dir_encoding_swap
        } dir_encoding;
  
      /* Prefer 8bit or 32bit displacement in encoding.  */
@@ -660,6 +695,13 @@ static enum
      vex256
    } avxscalar;
  
+/* Encode VEX WIG instructions with specific vex.w.  */
+static enum
+  {
+    vexw0 = 0,
+    vexw1
+  } vexwig;
+
  /* Encode scalar EVEX LIG instructions with specific vector length.  */
  static enum
    {
@@ -856,6 +898,10 @@ static const arch_entry cpu_arch[] =
      CPU_387_FLAGS, 0 },
    { STRING_COMMA_LEN (".687"), PROCESSOR_UNKNOWN,
      CPU_687_FLAGS, 0 },
+  { STRING_COMMA_LEN (".cmov"), PROCESSOR_UNKNOWN,
+    CPU_CMOV_FLAGS, 0 },
+  { STRING_COMMA_LEN (".fxsr"), PROCESSOR_UNKNOWN,
+    CPU_FXSR_FLAGS, 0 },
    { STRING_COMMA_LEN (".mmx"), PROCESSOR_UNKNOWN,
      CPU_MMX_FLAGS, 0 },
    { STRING_COMMA_LEN (".sse"), PROCESSOR_UNKNOWN,
@@ -1042,6 +1088,8 @@ static const noarch_entry cpu_noarch[] =
    { STRING_COMMA_LEN ("no287"),  CPU_ANY_287_FLAGS },
    { STRING_COMMA_LEN ("no387"),  CPU_ANY_387_FLAGS },
    { STRING_COMMA_LEN ("no687"),  CPU_ANY_687_FLAGS },
+  { STRING_COMMA_LEN ("nocmov"),  CPU_ANY_CMOV_FLAGS },
+  { STRING_COMMA_LEN ("nofxsr"),  CPU_ANY_FXSR_FLAGS },
    { STRING_COMMA_LEN ("nommx"),  CPU_ANY_MMX_FLAGS },
    { STRING_COMMA_LEN ("nosse"),  CPU_ANY_SSE_FLAGS },
    { STRING_COMMA_LEN ("nosse2"),  CPU_ANY_SSE2_FLAGS },
@@ -2007,7 +2055,7 @@ operand_size_match (const insn_template *t)
           break;
         }
  
-      if (i.types[j].bitfield.mem && !match_mem_size (t, j, j))
+      if ((i.flags[j] & Operand_Mem) && !match_mem_size (t, j, j))
         {
           match = 0;
           break;
@@ -2023,17 +2071,26 @@ mismatch:
      }
  
    /* Check reverse.  */
-  gas_assert (i.operands == 2);
+  gas_assert (i.operands >= 2 && i.operands <= 3);
  
-  for (j = 0; j < 2; j++)
+  for (j = 0; j < i.operands; j++)
      {
-      if ((t->operand_types[j].bitfield.reg
-          || t->operand_types[j].bitfield.acc)
-         && !match_operand_size (t, j, !j))
+      unsigned int given = i.operands - j - 1;
+
+      if (t->operand_types[j].bitfield.reg
+         && !match_operand_size (t, j, given))
+       goto mismatch;
+
+      if (t->operand_types[j].bitfield.regsimd
+         && !match_simd_size (t, j, given))
+       goto mismatch;
+
+      if (t->operand_types[j].bitfield.acc
+         && (!match_operand_size (t, j, given)
+             || !match_simd_size (t, j, given)))
         goto mismatch;
  
-      if (i.types[!j].bitfield.mem
-         && !match_mem_size (t, j, !j))
+      if ((i.flags[given] & Operand_Mem) && !match_mem_size (t, j, given))
         goto mismatch;
      }
  
@@ -3314,6 +3371,7 @@ build_vex_prefix (const insn_template *t)
    unsigned int register_specifier;
    unsigned int implied_prefix;
    unsigned int vector_length;
+  unsigned int w;
  
    /* Check register specifier.  */
    if (i.vex.register_specifier)
@@ -3325,13 +3383,15 @@ build_vex_prefix (const insn_template *t)
    else
      register_specifier = 0xf;
  
-  /* Use 2-byte VEX prefix by swapping destination and source
-     operand.  */
-  if (i.vec_encoding != vex_encoding_vex3
+  /* Use 2-byte VEX prefix by swapping destination and source operand
+     if there are more than 1 register operand.  */
+  if (i.reg_operands > 1
+      && i.vec_encoding != vex_encoding_vex3
        && i.dir_encoding == dir_encoding_default
        && i.operands == i.reg_operands
+      && operand_type_equal (&i.types[0], &i.types[i.operands - 1])
        && i.tm.opcode_modifier.vexopcode == VEX0F
-      && i.tm.opcode_modifier.load
+      && (i.tm.opcode_modifier.load || i.tm.opcode_modifier.d)
        && i.rex == REX_B)
      {
        unsigned int xchg = i.operands - 1;
@@ -3352,8 +3412,11 @@ build_vex_prefix (const insn_template *t)
        i.rm.regmem = i.rm.reg;
        i.rm.reg = xchg;
  
-      /* Use the next insn.  */
-      i.tm = t[1];
+      if (i.tm.opcode_modifier.d)
+       i.tm.base_opcode ^= (i.tm.base_opcode & 0xee) != 0x6e
+                           ? Opcode_SIMD_FloatD : Opcode_SIMD_IntD;
+      else /* Use the next insn.  */
+       i.tm = t[1];
      }
  
    if (i.tm.opcode_modifier.vex == VEXScalar)
@@ -3362,12 +3425,12 @@ build_vex_prefix (const insn_template *t)
      vector_length = 1;
    else
      {
-      int op;
+      unsigned int op;
  
        /* Determine vector length from the last multi-length vector
          operand.  */
        vector_length = 0;
-      for (op = t->operands - 1; op >= 0; op--)
+      for (op = t->operands; op--;)
         if (t->operand_types[op].bitfield.xmmword
             && t->operand_types[op].bitfield.ymmword
             && i.types[op].bitfield.ymmword)
@@ -3395,10 +3458,18 @@ build_vex_prefix (const insn_template *t)
        abort ();
      }
  
+  /* Check the REX.W bit and VEXW.  */
+  if (i.tm.opcode_modifier.vexw == VEXWIG)
+    w = (vexwig == vexw1 || (i.rex & REX_W)) ? 1 : 0;
+  else if (i.tm.opcode_modifier.vexw)
+    w = i.tm.opcode_modifier.vexw == VEXW1 ? 1 : 0;
+  else
+    w = (flag_code == CODE_64BIT ? i.rex & REX_W : vexwig == vexw1) ? 1 : 0;
+
    /* Use 2-byte VEX prefix if possible.  */
-  if (i.vec_encoding != vex_encoding_vex3
+  if (w == 0
+      && i.vec_encoding != vex_encoding_vex3
        && i.tm.opcode_modifier.vexopcode == VEX0F
-      && i.tm.opcode_modifier.vexw != VEXW1
        && (i.rex & (REX_W | REX_X | REX_B)) == 0)
      {
        /* 2-byte VEX prefix.  */
@@ -3417,7 +3488,7 @@ build_vex_prefix (const insn_template *t)
    else
      {
        /* 3-byte VEX prefix.  */
-      unsigned int m, w;
+      unsigned int m;
  
        i.vex.length = 3;
  
@@ -3455,11 +3526,6 @@ build_vex_prefix (const insn_template *t)
          of RXB bits from REX.  */
        i.vex.bytes[1] = (~i.rex & 0x7) << 5 | m;
  
-      /* Check the REX.W bit.  */
-      w = (i.rex & REX_W) ? 1 : 0;
-      if (i.tm.opcode_modifier.vexw == VEXW1)
-       w = 1;
-
        i.vex.bytes[2] = (w << 7
                         | register_specifier << 3
                         | vector_length << 2
@@ -3475,6 +3541,13 @@ is_evex_encoding (const insn_template *t)
          || t->opcode_modifier.staticrounding || t->opcode_modifier.sae;
  }
  
+static INLINE bfd_boolean
+is_any_vex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.vex || t->opcode_modifier.vexopcode
+        || is_evex_encoding (t);
+}
+
  /* Build the EVEX prefix.  */
  
  static void
@@ -3578,19 +3651,13 @@ build_evex_prefix (void)
    i.vrex &= ~vrex_used;
    gas_assert (i.vrex == 0);
  
-  /* Check the REX.W bit.  */
-  w = (i.rex & REX_W) ? 1 : 0;
-  if (i.tm.opcode_modifier.vexw)
-    {
-      if (i.tm.opcode_modifier.vexw == VEXW1)
-       w = 1;
-    }
-  /* If w is not set it means we are dealing with WIG instruction.  */
-  else if (!w)
-    {
-      if (evexwig == evexw1)
-        w = 1;
-    }
+  /* Check the REX.W bit and VEXW.  */
+  if (i.tm.opcode_modifier.vexw == VEXWIG)
+    w = (evexwig == evexw1 || (i.rex & REX_W)) ? 1 : 0;
+  else if (i.tm.opcode_modifier.vexw)
+    w = i.tm.opcode_modifier.vexw == VEXW1 ? 1 : 0;
+  else
+    w = (flag_code == CODE_64BIT ? i.rex & REX_W : evexwig == evexw1) ? 1 : 0;
  
    /* Encode the U bit.  */
    implied_prefix |= 0x4;
@@ -3612,12 +3679,12 @@ build_evex_prefix (void)
        if (!i.tm.opcode_modifier.evex
           || i.tm.opcode_modifier.evex == EVEXDYN)
         {
-         int op;
+         unsigned int op;
  
           /* Determine vector length from the last multi-length vector
              operand.  */
           vec_length = 0;
-         for (op = i.operands - 1; op >= 0; op--)
+         for (op = i.operands; op--;)
             if (i.tm.operand_types[op].bitfield.xmmword
                 + i.tm.operand_types[op].bitfield.ymmword
                 + i.tm.operand_types[op].bitfield.zmmword > 1)
@@ -3639,8 +3706,7 @@ build_evex_prefix (void)
                   }
                 else if (i.broadcast && (int) op == i.broadcast->operand)
                   {
-                   switch ((i.tm.operand_types[op].bitfield.dword ? 4 : 8)
-                           * i.broadcast->type)
+                   switch (i.broadcast->bytes)
                       {
                         case 64:
                           i.tm.opcode_modifier.evex = EVEX512;
@@ -3658,7 +3724,7 @@ build_evex_prefix (void)
                   }
               }
  
-         if (op < 0)
+         if (op >= MAX_OPERANDS)
             abort ();
         }
  
@@ -3758,9 +3824,7 @@ bad_register_operand:
  
    gas_assert (i.imm_operands <= 1
               && (i.operands <= 2
-                 || ((i.tm.opcode_modifier.vex
-                      || i.tm.opcode_modifier.vexopcode
-                      || is_evex_encoding (&i.tm))
+                 || (is_any_vex_encoding (&i.tm)
                       && i.operands <= 4)));
  
    exp = &im_expressions[i.imm_operands++];
@@ -3913,8 +3977,7 @@ optimize_encoding (void)
             }
         }
      }
-  else if (optimize > 1
-          && i.reg_operands == 3
+  else if (i.reg_operands == 3
            && i.op[0].regs == i.op[1].regs
            && !i.types[2].bitfield.xmmword
            && (i.tm.opcode_modifier.vex
@@ -3922,10 +3985,10 @@ optimize_encoding (void)
                    && !i.rounding
                    && is_evex_encoding (&i.tm)
                    && (i.vec_encoding != vex_encoding_evex
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl
                        || i.tm.cpu_flags.bitfield.cpuavx512vl
                        || (i.tm.operand_types[2].bitfield.zmmword
-                          && i.types[2].bitfield.ymmword)
-                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
+                          && i.types[2].bitfield.ymmword))))
            && ((i.tm.base_opcode == 0x55
                 || i.tm.base_opcode == 0x6655
                 || i.tm.base_opcode == 0x66df
@@ -3935,18 +3998,22 @@ optimize_encoding (void)
                 || i.tm.base_opcode == 0x66f8
                 || i.tm.base_opcode == 0x66f9
                 || i.tm.base_opcode == 0x66fa
-               || i.tm.base_opcode == 0x66fb)
+               || i.tm.base_opcode == 0x66fb
+               || i.tm.base_opcode == 0x42
+               || i.tm.base_opcode == 0x6642
+               || i.tm.base_opcode == 0x47
+               || i.tm.base_opcode == 0x6647)
                && i.tm.extension_opcode == None))
      {
-      /* Optimize: -O2:
+      /* Optimize: -O1:
            VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,
            vpsubq and vpsubw:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              VEX VOP %ymmM, %ymmM, %ymmN
                -> VEX VOP %xmmM, %xmmM, %xmmN
            VOP, one of vpandn and vpxor:
@@ -3955,28 +4022,41 @@ optimize_encoding (void)
            VOP, one of vpandnd and vpandnq:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
            VOP, one of vpxord and vpxorq:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
+          VOP, one of kxord and kxorq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kxorw %kM, %kM, %kN
+          VOP, one of kandnd and kandnq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kandnw %kM, %kM, %kN
         */
        if (is_evex_encoding (&i.tm))
         {
-         if (i.vec_encoding == vex_encoding_evex)
-           i.tm.opcode_modifier.evex = EVEX128;
-         else
+         if (i.vec_encoding != vex_encoding_evex)
             {
               i.tm.opcode_modifier.vex = VEX128;
               i.tm.opcode_modifier.vexw = VEXW0;
               i.tm.opcode_modifier.evex = 0;
             }
+         else if (optimize > 1)
+           i.tm.opcode_modifier.evex = EVEX128;
+         else
+           return;
+       }
+      else if (i.tm.operand_types[0].bitfield.regmask)
+       {
+         i.tm.base_opcode &= 0xff;
+         i.tm.opcode_modifier.vexw = VEXW0;
         }
        else
         i.tm.opcode_modifier.vex = VEX128;
@@ -3988,6 +4068,56 @@ optimize_encoding (void)
             i.types[j].bitfield.ymmword = 0;
           }
      }
+  else if ((cpu_arch_flags.bitfield.cpuavx
+           || cpu_arch_isa_flags.bitfield.cpuavx)
+          && i.vec_encoding != vex_encoding_evex
+          && !i.types[0].bitfield.zmmword
+          && !i.mask
+          && is_evex_encoding (&i.tm)
+          && (i.tm.base_opcode == 0x666f
+              || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
+              || i.tm.base_opcode == 0xf36f
+              || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
+              || i.tm.base_opcode == 0xf26f
+              || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+          && i.tm.extension_opcode == None)
+    {
+      /* Optimize: -O1:
+          VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
+          vmovdqu32 and vmovdqu64:
+            EVEX VOP %xmmM, %xmmN
+              -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
+            EVEX VOP %ymmM, %ymmN
+              -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
+            EVEX VOP %xmmM, mem
+              -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
+            EVEX VOP %ymmM, mem
+              -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
+            EVEX VOP mem, %xmmN
+              -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)
+            EVEX VOP mem, %ymmN
+              -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
+       */
+      if (i.tm.base_opcode == 0xf26f)
+       i.tm.base_opcode = 0xf36f;
+      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+       i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
+      i.tm.opcode_modifier.vex
+       = i.types[0].bitfield.ymmword ? VEX256 : VEX128;
+      i.tm.opcode_modifier.vexw = VEXW0;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.masking = 0;
+      i.tm.opcode_modifier.disp8memshift = 0;
+      i.memshift = 0;
+      for (j = 0; j < 2; j++)
+       if (operand_type_check (i.types[j], disp)
+           && i.op[j].disps->X_op == O_constant)
+         {
+           i.types[j].bitfield.disp8
+             = fits_in_disp8 (i.op[j].disps->X_add_number);
+           break;
+         }
+    }
  }
  
  /* This is the guts of the machine-dependent assembler.  LINE points to a
@@ -4123,6 +4253,13 @@ md_assemble (char *line)
        return;
      }
  
+  /* Check for data size prefix on VEX/XOP/EVEX encoded insns.  */
+  if (i.prefix[DATA_PREFIX] && is_any_vex_encoding (&i.tm))
+    {
+      as_bad (_("data size prefix invalid with `%s'"), i.tm.name);
+      return;
+    }
+
    /* Check if HLE prefix is OK.  */
    if (i.hle_prefix && !check_hle ())
      return;
@@ -4209,8 +4346,7 @@ md_assemble (char *line)
        as_warn (_("translating to `%sp'"), i.tm.name);
      }
  
-  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.vexopcode
-      || is_evex_encoding (&i.tm))
+  if (is_any_vex_encoding (&i.tm))
      {
        if (flag_code == CODE_16BIT)
         {
@@ -4376,10 +4512,10 @@ parse_insn (char *line, char *mnemonic)
             }
           /* If we are in 16-bit mode, do not allow addr16 or data16.
              Similarly, in 32-bit mode, do not allow addr32 or data32.  */
-         if ((current_templates->start->opcode_modifier.size16
-              || current_templates->start->opcode_modifier.size32)
+         if ((current_templates->start->opcode_modifier.size == SIZE16
+              || current_templates->start->opcode_modifier.size == SIZE32)
               && flag_code != CODE_64BIT
-             && (current_templates->start->opcode_modifier.size32
+             && ((current_templates->start->opcode_modifier.size == SIZE32)
                   ^ (flag_code == CODE_16BIT)))
             {
               as_bad (_("redundant %s prefix"),
@@ -4463,10 +4599,11 @@ parse_insn (char *line, char *mnemonic)
  
    if (!current_templates)
      {
-      /* Check if we should swap operand or force 32bit displacement in
+      /* Deprecated functionality (new code should use pseudo-prefixes instead):
+        Check if we should swap operand or force 32bit displacement in
          encoding.  */
        if (mnem_p - 2 == dot_p && dot_p[1] == 's')
-       i.dir_encoding = dir_encoding_store;
+       i.dir_encoding = dir_encoding_swap;
        else if (mnem_p - 3 == dot_p
                && dot_p[1] == 'd'
                && dot_p[2] == '8')
@@ -4486,46 +4623,50 @@ parse_insn (char *line, char *mnemonic)
    if (!current_templates)
      {
  check_suffix:
-      /* See if we can get a match by trimming off a suffix.  */
-      switch (mnem_p[-1])
+      if (mnem_p > mnemonic)
         {
-       case WORD_MNEM_SUFFIX:
-         if (intel_syntax && (intel_float_operand (mnemonic) & 2))
-           i.suffix = SHORT_MNEM_SUFFIX;
-         else
-           /* Fall through.  */
-       case BYTE_MNEM_SUFFIX:
-       case QWORD_MNEM_SUFFIX:
-         i.suffix = mnem_p[-1];
-         mnem_p[-1] = '\0';
-         current_templates = (const templates *) hash_find (op_hash,
-                                                             mnemonic);
-         break;
-       case SHORT_MNEM_SUFFIX:
-       case LONG_MNEM_SUFFIX:
-         if (!intel_syntax)
-           {
-             i.suffix = mnem_p[-1];
-             mnem_p[-1] = '\0';
-             current_templates = (const templates *) hash_find (op_hash,
-                                                                 mnemonic);
-           }
-         break;
-
-         /* Intel Syntax.  */
-       case 'd':
-         if (intel_syntax)
+         /* See if we can get a match by trimming off a suffix.  */
+         switch (mnem_p[-1])
             {
-             if (intel_float_operand (mnemonic) == 1)
+           case WORD_MNEM_SUFFIX:
+             if (intel_syntax && (intel_float_operand (mnemonic) & 2))
                 i.suffix = SHORT_MNEM_SUFFIX;
               else
-               i.suffix = LONG_MNEM_SUFFIX;
+               /* Fall through.  */
+             case BYTE_MNEM_SUFFIX:
+             case QWORD_MNEM_SUFFIX:
+               i.suffix = mnem_p[-1];
               mnem_p[-1] = '\0';
               current_templates = (const templates *) hash_find (op_hash,
-                                                                 mnemonic);
+                                                                mnemonic);
+             break;
+           case SHORT_MNEM_SUFFIX:
+           case LONG_MNEM_SUFFIX:
+             if (!intel_syntax)
+               {
+                 i.suffix = mnem_p[-1];
+                 mnem_p[-1] = '\0';
+                 current_templates = (const templates *) hash_find (op_hash,
+                                                                    mnemonic);
+               }
+             break;
+
+             /* Intel Syntax.  */
+           case 'd':
+             if (intel_syntax)
+               {
+                 if (intel_float_operand (mnemonic) == 1)
+                   i.suffix = SHORT_MNEM_SUFFIX;
+                 else
+                   i.suffix = LONG_MNEM_SUFFIX;
+                 mnem_p[-1] = '\0';
+                 current_templates = (const templates *) hash_find (op_hash,
+                                                                    mnemonic);
+               }
+             break;
             }
-         break;
         }
+
        if (!current_templates)
         {
           as_bad (_("no such instruction: `%s'"), token_start);
@@ -4673,6 +4814,13 @@ parse_operands (char *l, const char *mnemonic)
           /* Now parse operand adding info to 'i' as we go along.  */
           END_STRING_AND_SAVE (l);
  
+         if (i.mem_operands > 1)
+           {
+             as_bad (_("too many memory references for `%s'"),
+                     mnemonic);
+             return 0;
+           }
+
           if (intel_syntax)
             operand_ok =
               i386_intel_operand (token_start,
@@ -4718,14 +4866,21 @@ swap_2_operands (int xchg1, int xchg2)
  {
    union i386_op temp_op;
    i386_operand_type temp_type;
+  unsigned int temp_flags;
    enum bfd_reloc_code_real temp_reloc;
  
    temp_type = i.types[xchg2];
    i.types[xchg2] = i.types[xchg1];
    i.types[xchg1] = temp_type;
+
+  temp_flags = i.flags[xchg2];
+  i.flags[xchg2] = i.flags[xchg1];
+  i.flags[xchg1] = temp_flags;
+
    temp_op = i.op[xchg2];
    i.op[xchg2] = i.op[xchg1];
    i.op[xchg1] = temp_op;
+
    temp_reloc = i.reloc[xchg2];
    i.reloc[xchg2] = i.reloc[xchg1];
    i.reloc[xchg1] = temp_reloc;
@@ -5008,6 +5163,22 @@ optimize_disp (void)
        }
  }
  
+/* Return 1 if there is a match in broadcast bytes between operand
+   GIVEN and instruction template T.   */
+
+static INLINE int
+match_broadcast_size (const insn_template *t, unsigned int given)
+{
+  return ((t->opcode_modifier.broadcast == BYTE_BROADCAST
+          && i.types[given].bitfield.byte)
+         || (t->opcode_modifier.broadcast == WORD_BROADCAST
+             && i.types[given].bitfield.word)
+         || (t->opcode_modifier.broadcast == DWORD_BROADCAST
+             && i.types[given].bitfield.dword)
+         || (t->opcode_modifier.broadcast == QWORD_BROADCAST
+             && i.types[given].bitfield.qword));
+}
+
  /* Check if operands are valid for the instruction.  */
  
  static int
@@ -5126,23 +5297,29 @@ check_VecOperands (const insn_template *t)
        i386_operand_type type, overlap;
  
        /* Check if specified broadcast is supported in this instruction,
-        and it's applied to memory operand of DWORD or QWORD type.  */
+        and its broadcast bytes match the memory operand.  */
        op = i.broadcast->operand;
        if (!t->opcode_modifier.broadcast
-         || !i.types[op].bitfield.mem
+         || !(i.flags[op] & Operand_Mem)
           || (!i.types[op].bitfield.unspecified
-             && (t->operand_types[op].bitfield.dword
-                 ? !i.types[op].bitfield.dword
-                 : !i.types[op].bitfield.qword)))
+             && !match_broadcast_size (t, op)))
         {
         bad_broadcast:
           i.error = unsupported_broadcast;
           return 1;
         }
  
+      i.broadcast->bytes = ((1 << (t->opcode_modifier.broadcast - 1))
+                           * i.broadcast->type);
        operand_type_set (&type, 0);
-      switch ((t->operand_types[op].bitfield.dword ? 4 : 8) * i.broadcast->type)
+      switch (i.broadcast->bytes)
         {
+       case 2:
+         type.bitfield.word = 1;
+         break;
+       case 4:
+         type.bitfield.dword = 1;
+         break;
         case 8:
           type.bitfield.qword = 1;
           break;
@@ -5189,9 +5366,7 @@ check_VecOperands (const insn_template *t)
           break;
        gas_assert (op < i.operands);
        /* Check size of the memory operand.  */
-      if (t->operand_types[op].bitfield.dword
-         ? i.types[op].bitfield.dword
-         : i.types[op].bitfield.qword)
+      if (match_broadcast_size (t, op))
         {
           i.error = broadcast_needed;
           return 1;
@@ -5201,13 +5376,39 @@ check_VecOperands (const insn_template *t)
      op = MAX_OPERANDS - 1; /* Avoid uninitialized variable warning.  */
  
    /* Check if requested masking is supported.  */
-  if (i.mask
-      && (!t->opcode_modifier.masking
-         || (i.mask->zeroing
-             && t->opcode_modifier.masking == MERGING_MASKING)))
+  if (i.mask)
      {
-      i.error = unsupported_masking;
-      return 1;
+      switch (t->opcode_modifier.masking)
+       {
+       case BOTH_MASKING:
+         break;
+       case MERGING_MASKING:
+         if (i.mask->zeroing)
+           {
+       case 0:
+             i.error = unsupported_masking;
+             return 1;
+           }
+         break;
+       case DYNAMIC_MASKING:
+         /* Memory destinations allow only merging masking.  */
+         if (i.mask->zeroing && i.mem_operands)
+           {
+             /* Find memory operand.  */
+             for (op = 0; op < i.operands; op++)
+               if (i.flags[op] & Operand_Mem)
+                 break;
+             gas_assert (op < i.operands);
+             if (op == i.operands - 1)
+               {
+                 i.error = unsupported_masking;
+                 return 1;
+               }
+           }
+         break;
+       default:
+         abort ();
+       }
      }
  
    /* Check if masking is applied to dest operand.  */
@@ -5245,7 +5446,7 @@ check_VecOperands (const insn_template *t)
        && i.disp_encoding != disp_encoding_32bit)
      {
        if (i.broadcast)
-       i.memshift = t->operand_types[op].bitfield.dword ? 2 : 3;
+       i.memshift = t->opcode_modifier.broadcast - 1;
        else if (t->opcode_modifier.disp8memshift != DISP8_SHIFT_VL)
         i.memshift = t->opcode_modifier.disp8memshift;
        else
@@ -5412,6 +5613,7 @@ match_template (char mnem_suffix)
    for (t = current_templates->start; t < current_templates->end; t++)
      {
        addr_prefix_disp = -1;
+      found_reverse_match = 0;
  
        if (i.operands != t->operands)
         continue;
@@ -5584,17 +5786,42 @@ match_template (char mnem_suffix)
               && i.types[0].bitfield.acc
               && operand_type_check (i.types[1], anymem))
             continue;
-         if (!(size_match & MATCH_STRAIGHT))
-           goto check_reverse;
-         /* If we want store form, we reverse direction of operands.  */
-         if (i.dir_encoding == dir_encoding_store
-             && t->opcode_modifier.d)
-           goto check_reverse;
           /* Fall through.  */
  
         case 3:
+         if (!(size_match & MATCH_STRAIGHT))
+           goto check_reverse;
+         /* Reverse direction of operands if swapping is possible in the first
+            place (operands need to be symmetric) and
+            - the load form is requested, and the template is a store form,
+            - the store form is requested, and the template is a load form,
+            - the non-default (swapped) form is requested.  */
+         overlap1 = operand_type_and (operand_types[0], operand_types[1]);
+         if (t->opcode_modifier.d && i.reg_operands == i.operands
+             && !operand_type_all_zero (&overlap1))
+           switch (i.dir_encoding)
+             {
+             case dir_encoding_load:
+               if (operand_type_check (operand_types[i.operands - 1], anymem)
+                   || operand_types[i.operands - 1].bitfield.regmem)
+                 goto check_reverse;
+               break;
+
+             case dir_encoding_store:
+               if (!operand_type_check (operand_types[i.operands - 1], anymem)
+                   && !operand_types[i.operands - 1].bitfield.regmem)
+                 goto check_reverse;
+               break;
+
+             case dir_encoding_swap:
+               goto check_reverse;
+
+             case dir_encoding_default:
+               break;
+             }
           /* If we want store form, we skip the current load.  */
-         if (i.dir_encoding == dir_encoding_store
+         if ((i.dir_encoding == dir_encoding_store
+              || i.dir_encoding == dir_encoding_swap)
               && i.mem_operands == 0
               && t->opcode_modifier.load)
             continue;
@@ -5618,14 +5845,14 @@ check_reverse:
               if (!(size_match & MATCH_REVERSE))
                 continue;
               /* Try reversing direction of operands.  */
-             overlap0 = operand_type_and (i.types[0], operand_types[1]);
-             overlap1 = operand_type_and (i.types[1], operand_types[0]);
+             overlap0 = operand_type_and (i.types[0], operand_types[i.operands - 1]);
+             overlap1 = operand_type_and (i.types[i.operands - 1], operand_types[0]);
               if (!operand_type_match (overlap0, i.types[0])
-                 || !operand_type_match (overlap1, i.types[1])
+                 || !operand_type_match (overlap1, i.types[i.operands - 1])
                   || (check_register
                       && !operand_type_register_match (i.types[0],
-                                                      operand_types[1],
-                                                      i.types[1],
+                                                      operand_types[i.operands - 1],
+                                                      i.types[i.operands - 1],
                                                        operand_types[0])))
                 {
                   /* Does not match either direction.  */
@@ -5637,6 +5864,13 @@ check_reverse:
                 found_reverse_match = 0;
               else if (operand_types[0].bitfield.tbyte)
                 found_reverse_match = Opcode_FloatD;
+             else if (operand_types[0].bitfield.xmmword
+                      || operand_types[i.operands - 1].bitfield.xmmword
+                      || operand_types[0].bitfield.regmmx
+                      || operand_types[i.operands - 1].bitfield.regmmx
+                      || is_any_vex_encoding(t))
+               found_reverse_match = (t->base_opcode & 0xee) != 0x6e
+                                     ? Opcode_SIMD_FloatD : Opcode_SIMD_IntD;
               else
                 found_reverse_match = Opcode_D;
               if (t->opcode_modifier.floatr)
@@ -5707,10 +5941,7 @@ check_reverse:
              slip through to break.  */
         }
        if (!found_cpu_match)
-       {
-         found_reverse_match = 0;
-         continue;
-       }
+       continue;
  
        /* Check if vector and VEX operands are valid.  */
        if (check_VecOperands (t) || VEX_check_operands (t))
@@ -5834,8 +6065,8 @@ check_reverse:
  
        i.tm.base_opcode ^= found_reverse_match;
  
-      i.tm.operand_types[0] = operand_types[1];
-      i.tm.operand_types[1] = operand_types[0];
+      i.tm.operand_types[0] = operand_types[i.operands - 1];
+      i.tm.operand_types[i.operands - 1] = operand_types[0];
      }
  
    return t;
@@ -5880,11 +6111,11 @@ process_suffix (void)
  {
    /* If matched instruction specifies an explicit instruction mnemonic
       suffix, use it.  */
-  if (i.tm.opcode_modifier.size16)
+  if (i.tm.opcode_modifier.size == SIZE16)
      i.suffix = WORD_MNEM_SUFFIX;
-  else if (i.tm.opcode_modifier.size32)
+  else if (i.tm.opcode_modifier.size == SIZE32)
      i.suffix = LONG_MNEM_SUFFIX;
-  else if (i.tm.opcode_modifier.size64)
+  else if (i.tm.opcode_modifier.size == SIZE64)
      i.suffix = QWORD_MNEM_SUFFIX;
    else if (i.reg_operands)
      {
@@ -5896,27 +6127,23 @@ process_suffix (void)
              Destination register type is more significant than source
              register type.  crc32 in SSE4.2 prefers source register
              type. */
-         if (i.tm.base_opcode == 0xf20f38f1)
+         if (i.tm.base_opcode == 0xf20f38f0 && i.types[0].bitfield.reg)
             {
-             if (i.types[0].bitfield.reg && i.types[0].bitfield.word)
+             if (i.types[0].bitfield.byte)
+               i.suffix = BYTE_MNEM_SUFFIX;
+             else if (i.types[0].bitfield.word)
                 i.suffix = WORD_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.reg && i.types[0].bitfield.dword)
+             else if (i.types[0].bitfield.dword)
                 i.suffix = LONG_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.reg && i.types[0].bitfield.qword)
+             else if (i.types[0].bitfield.qword)
                 i.suffix = QWORD_MNEM_SUFFIX;
             }
-         else if (i.tm.base_opcode == 0xf20f38f0)
-           {
-             if (i.types[0].bitfield.reg && i.types[0].bitfield.byte)
-               i.suffix = BYTE_MNEM_SUFFIX;
-           }
  
           if (!i.suffix)
             {
               int op;
  
-             if (i.tm.base_opcode == 0xf20f38f1
-                 || i.tm.base_opcode == 0xf20f38f0)
+             if (i.tm.base_opcode == 0xf20f38f0)
                 {
                   /* We have to know the operand size for crc32.  */
                   as_bad (_("ambiguous memory operand size for `%s`"),
@@ -6108,6 +6335,9 @@ process_suffix (void)
        else if (i.suffix != QWORD_MNEM_SUFFIX
                && !i.tm.opcode_modifier.ignoresize
                && !i.tm.opcode_modifier.floatmf
+              && !i.tm.opcode_modifier.vex
+              && !i.tm.opcode_modifier.vexopcode
+              && !is_evex_encoding (&i.tm)
                && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
                    || (flag_code == CODE_64BIT
                        && i.tm.opcode_modifier.jumpbyte)))
@@ -6957,6 +7187,21 @@ build_modrm_byte (void)
         {
           i.rm.reg = i.op[dest].regs->reg_num;
           i.rm.regmem = i.op[source].regs->reg_num;
+         if (i.op[dest].regs->reg_type.bitfield.regmmx
+              || i.op[source].regs->reg_type.bitfield.regmmx)
+           i.has_regmmx = TRUE;
+         else if (i.op[dest].regs->reg_type.bitfield.regsimd
+                  || i.op[source].regs->reg_type.bitfield.regsimd)
+           {
+             if (i.types[dest].bitfield.zmmword
+                 || i.types[source].bitfield.zmmword)
+               i.has_regzmm = TRUE;
+             else if (i.types[dest].bitfield.ymmword
+                      || i.types[source].bitfield.ymmword)
+               i.has_regymm = TRUE;
+             else
+               i.has_regxmm = TRUE;
+           }
           if ((i.op[dest].regs->reg_flags & RegRex) != 0)
             i.rex |= REX_R;
           if ((i.op[dest].regs->reg_flags & RegVRex) != 0)
@@ -7003,8 +7248,7 @@ build_modrm_byte (void)
  
           if (i.tm.opcode_modifier.vecsib)
             {
-             if (i.index_reg->reg_num == RegEiz
-                 || i.index_reg->reg_num == RegRiz)
+             if (i.index_reg->reg_num == RegIZ)
                 abort ();
  
               i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
@@ -7075,8 +7319,7 @@ build_modrm_byte (void)
               else if (!i.tm.opcode_modifier.vecsib)
                 {
                   /* !i.base_reg && i.index_reg  */
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                     i.sib.index = NO_INDEX_REGISTER;
                   else
                     i.sib.index = i.index_reg->reg_num;
@@ -7102,8 +7345,7 @@ build_modrm_byte (void)
                 }
             }
           /* RIP addressing for 64bit mode.  */
-         else if (i.base_reg->reg_num == RegRip ||
-                  i.base_reg->reg_num == RegEip)
+         else if (i.base_reg->reg_num == RegIP)
             {
               gas_assert (!i.tm.opcode_modifier.vecsib);
               i.rm.regmem = NO_BASE_REGISTER;
@@ -7195,8 +7437,7 @@ build_modrm_byte (void)
                 }
               else if (!i.tm.opcode_modifier.vecsib)
                 {
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                     i.sib.index = NO_INDEX_REGISTER;
                   else
                     i.sib.index = i.index_reg->reg_num;
@@ -7300,17 +7541,32 @@ build_modrm_byte (void)
           unsigned int vex_reg = ~0;
  
           for (op = 0; op < i.operands; op++)
-           if (i.types[op].bitfield.reg
-               || i.types[op].bitfield.regmmx
-               || i.types[op].bitfield.regsimd
-               || i.types[op].bitfield.regbnd
-               || i.types[op].bitfield.regmask
-               || i.types[op].bitfield.sreg2
-               || i.types[op].bitfield.sreg3
-               || i.types[op].bitfield.control
-               || i.types[op].bitfield.debug
-               || i.types[op].bitfield.test)
-             break;
+           {
+             if (i.types[op].bitfield.reg
+                 || i.types[op].bitfield.regbnd
+                 || i.types[op].bitfield.regmask
+                 || i.types[op].bitfield.sreg2
+                 || i.types[op].bitfield.sreg3
+                 || i.types[op].bitfield.control
+                 || i.types[op].bitfield.debug
+                 || i.types[op].bitfield.test)
+               break;
+             if (i.types[op].bitfield.regsimd)
+               {
+                 if (i.types[op].bitfield.zmmword)
+                   i.has_regzmm = TRUE;
+                 else if (i.types[op].bitfield.ymmword)
+                   i.has_regymm = TRUE;
+                 else
+                   i.has_regxmm = TRUE;
+                 break;
+               }
+             if (i.types[op].bitfield.regmmx)
+               {
+                 i.has_regmmx = TRUE;
+                 break;
+               }
+           }
  
           if (vex_3_sources)
             op = dest;
@@ -7690,12 +7946,218 @@ output_interseg_jump (void)
    md_number_to_chars (p + size, (valueT) i.op[0].imms->X_add_number, 2);
  }
  
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+void
+x86_cleanup (void)
+{
+  char *p;
+  asection *seg = now_seg;
+  subsegT subseg = now_subseg;
+  asection *sec;
+  unsigned int alignment, align_size_1;
+  unsigned int isa_1_descsz, feature_2_descsz, descsz;
+  unsigned int isa_1_descsz_raw, feature_2_descsz_raw;
+  unsigned int padding;
+
+  if (!IS_ELF || !x86_used_note)
+    return;
+
+  x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X86;
+
+  /* The .note.gnu.property section layout:
+
+     Field     Length          Contents
+     ----      ----            ----
+     n_namsz   4               4
+     n_descsz  4               The note descriptor size
+     n_type    4               NT_GNU_PROPERTY_TYPE_0
+     n_name    4               "GNU"
+     n_desc    n_descsz        The program property array
+     ....      ....            ....
+   */
+
+  /* Create the .note.gnu.property section.  */
+  sec = subseg_new (NOTE_GNU_PROPERTY_SECTION_NAME, 0);
+  bfd_set_section_flags (stdoutput, sec,
+                        (SEC_ALLOC
+                         | SEC_LOAD
+                         | SEC_DATA
+                         | SEC_HAS_CONTENTS
+                         | SEC_READONLY));
+
+  if (get_elf_backend_data (stdoutput)->s->elfclass == ELFCLASS64)
+    {
+      align_size_1 = 7;
+      alignment = 3;
+    }
+  else
+    {
+      align_size_1 = 3;
+      alignment = 2;
+    }
+
+  bfd_set_section_alignment (stdoutput, sec, alignment);
+  elf_section_type (sec) = SHT_NOTE;
+
+  /* GNU_PROPERTY_X86_ISA_1_USED: 4-byte type + 4-byte data size
+                                 + 4-byte data  */
+  isa_1_descsz_raw = 4 + 4 + 4;
+  /* Align GNU_PROPERTY_X86_ISA_1_USED.  */
+  isa_1_descsz = (isa_1_descsz_raw + align_size_1) & ~align_size_1;
+
+  feature_2_descsz_raw = isa_1_descsz;
+  /* GNU_PROPERTY_X86_FEATURE_2_USED: 4-byte type + 4-byte data size
+                                     + 4-byte data  */
+  feature_2_descsz_raw += 4 + 4 + 4;
+  /* Align GNU_PROPERTY_X86_FEATURE_2_USED.  */
+  feature_2_descsz = ((feature_2_descsz_raw + align_size_1)
+                     & ~align_size_1);
+
+  descsz = feature_2_descsz;
+  /* Section size: n_namsz + n_descsz + n_type + n_name + n_descsz.  */
+  p = frag_more (4 + 4 + 4 + 4 + descsz);
+
+  /* Write n_namsz.  */
+  md_number_to_chars (p, (valueT) 4, 4);
+
+  /* Write n_descsz.  */
+  md_number_to_chars (p + 4, (valueT) descsz, 4);
+
+  /* Write n_type.  */
+  md_number_to_chars (p + 4 * 2, (valueT) NT_GNU_PROPERTY_TYPE_0, 4);
+
+  /* Write n_name.  */
+  memcpy (p + 4 * 3, "GNU", 4);
+
+  /* Write 4-byte type.  */
+  md_number_to_chars (p + 4 * 4,
+                     (valueT) GNU_PROPERTY_X86_ISA_1_USED, 4);
+
+  /* Write 4-byte data size.  */
+  md_number_to_chars (p + 4 * 5, (valueT) 4, 4);
+
+  /* Write 4-byte data.  */
+  md_number_to_chars (p + 4 * 6, (valueT) x86_isa_1_used, 4);
+
+  /* Zero out paddings.  */
+  padding = isa_1_descsz - isa_1_descsz_raw;
+  if (padding)
+    memset (p + 4 * 7, 0, padding);
+
+  /* Write 4-byte type.  */
+  md_number_to_chars (p + isa_1_descsz + 4 * 4,
+                     (valueT) GNU_PROPERTY_X86_FEATURE_2_USED, 4);
+
+  /* Write 4-byte data size.  */
+  md_number_to_chars (p + isa_1_descsz + 4 * 5, (valueT) 4, 4);
+
+  /* Write 4-byte data.  */
+  md_number_to_chars (p + isa_1_descsz + 4 * 6,
+                     (valueT) x86_feature_2_used, 4);
+
+  /* Zero out paddings.  */
+  padding = feature_2_descsz - feature_2_descsz_raw;
+  if (padding)
+    memset (p + isa_1_descsz + 4 * 7, 0, padding);
+
+  /* We probably can't restore the current segment, for there likely
+     isn't one yet...  */
+  if (seg && subseg)
+    subseg_set (seg, subseg);
+}
+#endif
+
  static void
  output_insn (void)
  {
    fragS *insn_start_frag;
    offsetT insn_start_off;
  
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
+  if (IS_ELF && x86_used_note)
+    {
+      if (i.tm.cpu_flags.bitfield.cpucmov)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_CMOV;
+      if (i.tm.cpu_flags.bitfield.cpusse)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE;
+      if (i.tm.cpu_flags.bitfield.cpusse2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE2;
+      if (i.tm.cpu_flags.bitfield.cpusse3)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE3;
+      if (i.tm.cpu_flags.bitfield.cpussse3)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSSE3;
+      if (i.tm.cpu_flags.bitfield.cpusse4_1)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE4_1;
+      if (i.tm.cpu_flags.bitfield.cpusse4_2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_SSE4_2;
+      if (i.tm.cpu_flags.bitfield.cpuavx)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX;
+      if (i.tm.cpu_flags.bitfield.cpuavx2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX2;
+      if (i.tm.cpu_flags.bitfield.cpufma)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_FMA;
+      if (i.tm.cpu_flags.bitfield.cpuavx512f)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512F;
+      if (i.tm.cpu_flags.bitfield.cpuavx512cd)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512CD;
+      if (i.tm.cpu_flags.bitfield.cpuavx512er)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512ER;
+      if (i.tm.cpu_flags.bitfield.cpuavx512pf)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512PF;
+      if (i.tm.cpu_flags.bitfield.cpuavx512vl)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512VL;
+      if (i.tm.cpu_flags.bitfield.cpuavx512dq)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512DQ;
+      if (i.tm.cpu_flags.bitfield.cpuavx512bw)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512BW;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_4fmaps)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_4FMAPS;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_4vnniw)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_4VNNIW;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_bitalg)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_BITALG;
+      if (i.tm.cpu_flags.bitfield.cpuavx512ifma)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_IFMA;
+      if (i.tm.cpu_flags.bitfield.cpuavx512vbmi)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_VBMI;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_vbmi2)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_VBMI2;
+      if (i.tm.cpu_flags.bitfield.cpuavx512_vnni)
+       x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_AVX512_VNNI;
+
+      if (i.tm.cpu_flags.bitfield.cpu8087
+         || i.tm.cpu_flags.bitfield.cpu287
+         || i.tm.cpu_flags.bitfield.cpu387
+         || i.tm.cpu_flags.bitfield.cpu687
+         || i.tm.cpu_flags.bitfield.cpufisttp)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X87;
+      /* Don't set GNU_PROPERTY_X86_FEATURE_2_MMX for prefetchtXXX nor
+        Xfence instructions.  */
+      if (i.tm.base_opcode != 0xf18
+         && i.tm.base_opcode != 0xf0d
+         && i.tm.base_opcode != 0xfae
+         && (i.has_regmmx
+             || i.tm.cpu_flags.bitfield.cpummx
+             || i.tm.cpu_flags.bitfield.cpua3dnow
+             || i.tm.cpu_flags.bitfield.cpua3dnowa))
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_MMX;
+      if (i.has_regxmm)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XMM;
+      if (i.has_regymm)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_YMM;
+      if (i.has_regzmm)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_ZMM;
+      if (i.tm.cpu_flags.bitfield.cpufxsr)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_FXSR;
+      if (i.tm.cpu_flags.bitfield.cpuxsave)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVE;
+      if (i.tm.cpu_flags.bitfield.cpuxsaveopt)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVEOPT;
+      if (i.tm.cpu_flags.bitfield.cpuxsavec)
+       x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVEC;
+    }
+#endif
+
    /* Tie dwarf2 debug info to the address at the start of the insn.
       We can't do this after the insn has been output as the current
       frag may have been closed off.  eg. by frag_var.  */
@@ -7936,7 +8398,8 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
               int size = disp_size (n);
               offsetT val = i.op[n].disps->X_add_number;
  
-             val = offset_in_range (val >> i.memshift, size);
+             val = offset_in_range (val >> (size == 1 ? i.memshift : 0),
+                                    size);
               p = frag_more (size);
               md_number_to_chars (p, val, size);
             }
@@ -8021,12 +8484,13 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
               /* Check for "call/jmp *mem", "mov mem, %reg",
                  "test %reg, mem" and "binop mem, %reg" where binop
                  is one of adc, add, and, cmp, or, sbb, sub, xor
-                instructions.  Always generate R_386_GOT32X for
-                "sym*GOT" operand in 32-bit mode.  */
-             if ((generate_relax_relocations
-                  || (!object_64bit
-                      && i.rm.mode == 0
-                      && i.rm.regmem == 5))
+                instructions without data prefix.  Always generate
+                R_386_GOT32X for "sym*GOT" operand in 32-bit mode.  */
+             if (i.prefix[DATA_PREFIX] == 0
+                 && (generate_relax_relocations
+                     || (!object_64bit
+                         && i.rm.mode == 0
+                         && i.rm.regmem == 5))
                   && (i.rm.mode == 2
                       || (i.rm.mode == 0 && i.rm.regmem == 5))
                   && ((i.operands == 1
@@ -8041,8 +8505,7 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
                     {
                       fixP->fx_tcbit = i.rex != 0;
                       if (i.base_reg
-                         && (i.base_reg->reg_num == RegRip
-                             || i.base_reg->reg_num == RegEip))
+                         && (i.base_reg->reg_num == RegIP))
                       fixP->fx_tcbit2 = 1;
                     }
                   else
@@ -8522,6 +8985,15 @@ x86_cons (expressionS *exp, int size)
               as_bad (_("missing or invalid expression `%s'"), save);
               *input_line_pointer = c;
             }
+         else if ((got_reloc == BFD_RELOC_386_PLT32
+                   || got_reloc == BFD_RELOC_X86_64_PLT32)
+                  && exp->X_op != O_symbol)
+           {
+             char c = *input_line_pointer;
+             *input_line_pointer = 0;
+             as_bad (_("invalid PLT expression `%s'"), save);
+             *input_line_pointer = c;
+           }
         }
      }
    else
@@ -8612,6 +9084,7 @@ check_VecOperations (char *op_string, char *op_end)
  
               broadcast_op.type = bcst_type;
               broadcast_op.operand = this_operand;
+             broadcast_op.bytes = 0;
               i.broadcast = &broadcast_op;
             }
           /* Check masking operation.  */
@@ -9142,9 +9615,7 @@ i386_addressing_mode (void)
  
           if (addr_reg)
             {
-             if (addr_reg->reg_num == RegEip
-                 || addr_reg->reg_num == RegEiz
-                 || addr_reg->reg_type.bitfield.dword)
+             if (addr_reg->reg_type.bitfield.dword)
                 addr_mode = CODE_32BIT;
               else if (flag_code != CODE_64BIT
                        && addr_reg->reg_type.bitfield.word)
@@ -9254,21 +9725,18 @@ bad_address:
         {
           /* 32-bit/64-bit checks.  */
           if ((i.base_reg
-              && (addr_mode == CODE_64BIT
-                  ? !i.base_reg->reg_type.bitfield.qword
-                  : !i.base_reg->reg_type.bitfield.dword)
-              && (i.index_reg
-                  || (i.base_reg->reg_num
-                      != (addr_mode == CODE_64BIT ? RegRip : RegEip))))
+              && ((addr_mode == CODE_64BIT
+                   ? !i.base_reg->reg_type.bitfield.qword
+                   : !i.base_reg->reg_type.bitfield.dword)
+                  || (i.index_reg && i.base_reg->reg_num == RegIP)
+                  || i.base_reg->reg_num == RegIZ))
               || (i.index_reg
                   && !i.index_reg->reg_type.bitfield.xmmword
                   && !i.index_reg->reg_type.bitfield.ymmword
                   && !i.index_reg->reg_type.bitfield.zmmword
                   && ((addr_mode == CODE_64BIT
-                      ? !(i.index_reg->reg_type.bitfield.qword
-                          || i.index_reg->reg_num == RegRiz)
-                      : !(i.index_reg->reg_type.bitfield.dword
-                          || i.index_reg->reg_num == RegEiz))
+                      ? !i.index_reg->reg_type.bitfield.qword
+                      : !i.index_reg->reg_type.bitfield.dword)
                       || !i.index_reg->reg_type.bitfield.baseindex)))
             goto bad_address;
  
@@ -9277,7 +9745,7 @@ bad_address:
               || (current_templates->start->base_opcode & ~1) == 0x0f1a)
             {
               /* They cannot use RIP-relative addressing. */
-             if (i.base_reg && i.base_reg->reg_num == RegRip)
+             if (i.base_reg && i.base_reg->reg_num == RegIP)
                 {
                   as_bad (_("`%s' cannot be used here"), operand_string);
                   return 0;
@@ -9728,7 +10196,7 @@ i386_att_operand (char *operand_string)
  
        if (i386_index_check (operand_string) == 0)
         return 0;
-      i.types[this_operand].bitfield.mem = 1;
+      i.flags[this_operand] |= Operand_Mem;
        if (i.mem_operands == 0)
         i.memop1_string = xstrdup (operand_string);
        i.mem_operands++;
@@ -10140,9 +10608,11 @@ md_apply_fix (fixS *fixP, valueT *valP, segT seg ATTRIBUTE_UNUSED)
        {
        case BFD_RELOC_386_PLT32:
        case BFD_RELOC_X86_64_PLT32:
-       /* Make the jump instruction point to the address of the operand.  At
-          runtime we merely add the offset to the actual PLT entry.  */
-       value = -4;
+       /* Make the jump instruction point to the address of the operand.
+          At runtime we merely add the offset to the actual PLT entry.
+          NB: Subtract the offset size only for jump instructions.  */
+       if (fixP->fx_pcrel)
+         value = -4;
         break;
  
        case BFD_RELOC_386_TLS_GD:
@@ -10337,15 +10807,14 @@ parse_real_register (char *reg_string, char **end_op)
      return (const reg_entry *) NULL;
  
    /* Don't allow fake index register unless allow_index_reg isn't 0. */
-  if (!allow_index_reg
-      && (r->reg_num == RegEiz || r->reg_num == RegRiz))
+  if (!allow_index_reg && r->reg_num == RegIZ)
      return (const reg_entry *) NULL;
  
    /* Upper 16 vector registers are only available with VREX in 64bit
       mode, and require EVEX encoding.  */
    if (r->reg_flags & RegVRex)
      {
-      if (!cpu_arch_flags.bitfield.cpuvrex
+      if (!cpu_arch_flags.bitfield.cpuavx512f
           || flag_code != CODE_64BIT)
         return (const reg_entry *) NULL;
  
@@ -10493,6 +10962,8 @@ const char *md_shortopts = "qnO::";
  #define OPTION_MAMD64 (OPTION_MD_BASE + 22)
  #define OPTION_MINTEL64 (OPTION_MD_BASE + 23)
  #define OPTION_MFENCE_AS_LOCK_ADD (OPTION_MD_BASE + 24)
+#define OPTION_X86_USED_NOTE (OPTION_MD_BASE + 25)
+#define OPTION_MVEXWIG (OPTION_MD_BASE + 26)
  
  struct option md_longopts[] =
  {
@@ -10504,6 +10975,7 @@ struct option md_longopts[] =
  #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
    {"x32", no_argument, NULL, OPTION_X32},
    {"mshared", no_argument, NULL, OPTION_MSHARED},
+  {"mx86-used-note", required_argument, NULL, OPTION_X86_USED_NOTE},
  #endif
    {"divide", no_argument, NULL, OPTION_DIVIDE},
    {"march", required_argument, NULL, OPTION_MARCH},
@@ -10516,6 +10988,7 @@ struct option md_longopts[] =
    {"msse-check", required_argument, NULL, OPTION_MSSE_CHECK},
    {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
    {"mavxscalar", required_argument, NULL, OPTION_MAVXSCALAR},
+  {"mvexwig", required_argument, NULL, OPTION_MVEXWIG},
    {"madd-bnd-prefix", no_argument, NULL, OPTION_MADD_BND_PREFIX},
    {"mevexlig", required_argument, NULL, OPTION_MEVEXLIG},
    {"mevexwig", required_argument, NULL, OPTION_MEVEXWIG},
@@ -10571,6 +11044,17 @@ md_parse_option (int c, const char *arg)
      case OPTION_MSHARED:
        shared = 1;
        break;
+
+    case OPTION_X86_USED_NOTE:
+      if (strcasecmp (arg, "yes") == 0)
+        x86_used_note = 1;
+      else if (strcasecmp (arg, "no") == 0)
+        x86_used_note = 0;
+      else
+        as_fatal (_("invalid -mx86-used-note= option: `%s'"), arg);
+      break;
+
+
  #endif
  #if (defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) \
       || defined (TE_PE) || defined (TE_PEP) || defined (OBJ_MACH_O))
@@ -10822,6 +11306,15 @@ md_parse_option (int c, const char *arg)
         as_fatal (_("invalid -mavxscalar= option: `%s'"), arg);
        break;
  
+    case OPTION_MVEXWIG:
+      if (strcmp (arg, "0") == 0)
+       vexwig = evexw0;
+      else if (strcmp (arg, "1") == 0)
+       vexwig = evexw1;
+      else
+       as_fatal (_("invalid -mvexwig= option: `%s'"), arg);
+      break;
+
      case OPTION_MADD_BND_PREFIX:
        add_bnd_prefix = 1;
        break;
@@ -10911,7 +11404,7 @@ md_parse_option (int c, const char *arg)
         {
           optimize_for_space = 1;
           /* Turn on all encoding optimizations.  */
-         optimize = -1;
+         optimize = INT_MAX;
         }
        else
         {
@@ -11045,8 +11538,8 @@ md_show_usage (FILE *stream)
    fprintf (stream, _("\
    -s                      ignored\n"));
  #endif
-#if (defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) \
-     || defined (TE_PE) || defined (TE_PEP))
+#if defined BFD64 && (defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) \
+                     || defined (TE_PE) || defined (TE_PEP))
    fprintf (stream, _("\
    --32/--64/--x32         generate 32bit/64bit/x32 code\n"));
  #endif
@@ -11070,52 +11563,81 @@ md_show_usage (FILE *stream)
    fprintf (stream, _("\
    -msse2avx               encode SSE instructions with VEX prefix\n"));
    fprintf (stream, _("\
-  -msse-check=[none|error|warning]\n\
+  -msse-check=[none|error|warning] (default: warning)\n\
                            check SSE instructions\n"));
    fprintf (stream, _("\
-  -moperand-check=[none|error|warning]\n\
+  -moperand-check=[none|error|warning] (default: warning)\n\
                            check operand combinations for validity\n"));
    fprintf (stream, _("\
-  -mavxscalar=[128|256]   encode scalar AVX instructions with specific vector\n\
+  -mavxscalar=[128|256] (default: 128)\n\
+                          encode scalar AVX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
-  -mevexlig=[128|256|512] encode scalar EVEX instructions with specific vector\n\
+  -mvexwig=[0|1] (default: 0)\n\
+                          encode VEX instructions with specific VEX.W value\n\
+                           for VEX.W bit ignored instructions\n"));
+  fprintf (stream, _("\
+  -mevexlig=[128|256|512] (default: 128)\n\
+                          encode scalar EVEX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
-  -mevexwig=[0|1]         encode EVEX instructions with specific EVEX.W value\n\
+  -mevexwig=[0|1] (default: 0)\n\
+                          encode EVEX instructions with specific EVEX.W value\n\
                             for EVEX.W bit ignored instructions\n"));
    fprintf (stream, _("\
-  -mevexrcig=[rne|rd|ru|rz]\n\
+  -mevexrcig=[rne|rd|ru|rz] (default: rne)\n\
                            encode EVEX instructions with specific EVEX.RC value\n\
                             for SAE-only ignored instructions\n"));
    fprintf (stream, _("\
-  -mmnemonic=[att|intel]  use AT&T/Intel mnemonic\n"));
+  -mmnemonic=[att|intel] "));
+  if (SYSV386_COMPAT)
+    fprintf (stream, _("(default: att)\n"));
+  else
+    fprintf (stream, _("(default: intel)\n"));
+  fprintf (stream, _("\
+                          use AT&T/Intel mnemonic\n"));
    fprintf (stream, _("\
-  -msyntax=[att|intel]    use AT&T/Intel syntax\n"));
+  -msyntax=[att|intel] (default: att)\n\
+                          use AT&T/Intel syntax\n"));
    fprintf (stream, _("\
    -mindex-reg             support pseudo index registers\n"));
    fprintf (stream, _("\
    -mnaked-reg             don't require `%%' prefix for registers\n"));
    fprintf (stream, _("\
    -madd-bnd-prefix        add BND prefix for all valid branches\n"));
+#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
    fprintf (stream, _("\
    -mshared                disable branch optimization for shared code\n"));
-# if defined (TE_PE) || defined (TE_PEP)
+  fprintf (stream, _("\
+  -mx86-used-note=[no|yes] "));
+  if (DEFAULT_X86_USED_NOTE)
+    fprintf (stream, _("(default: yes)\n"));
+  else
+    fprintf (stream, _("(default: no)\n"));
+  fprintf (stream, _("\
+                          generate x86 used ISA and feature properties\n"));
+#endif
+#if defined (TE_PE) || defined (TE_PEP)
    fprintf (stream, _("\
    -mbig-obj               generate big object files\n"));
  #endif
    fprintf (stream, _("\
-  -momit-lock-prefix=[no|yes]\n\
+  -momit-lock-prefix=[no|yes] (default: no)\n\
                            strip all lock prefixes\n"));
    fprintf (stream, _("\
-  -mfence-as-lock-add=[no|yes]\n\
+  -mfence-as-lock-add=[no|yes] (default: no)\n\
                            encode lfence, mfence and sfence as\n\
                             lock addl $0x0, (%%{re}sp)\n"));
    fprintf (stream, _("\
-  -mrelax-relocations=[no|yes]\n\
+  -mrelax-relocations=[no|yes] "));
+  if (DEFAULT_GENERATE_X86_RELAX_RELOCATIONS)
+    fprintf (stream, _("(default: yes)\n"));
+  else
+    fprintf (stream, _("(default: no)\n"));
+  fprintf (stream, _("\
                            generate relax relocations\n"));
    fprintf (stream, _("\
-  -mamd64                 accept only AMD64 ISA\n"));
+  -mamd64                 accept only AMD64 ISA [default]\n"));
    fprintf (stream, _("\
    -mintel64               accept only Intel64 ISA\n"));
  }