x86: Correct EVEX to 128-bit EVEX optimization

[deliverable/binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index 2bff48a778af7ce1e1baeb4bb0ca51d2e8e9fe89..3885728de7b9d25567c4cac477fe25f796476fa7 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1,5 +1,5 @@
  /* tc-i386.c -- Assemble code for the Intel 80386
-   Copyright (C) 1989-2018 Free Software Foundation, Inc.
+   Copyright (C) 1989-2019 Free Software Foundation, Inc.
  
     This file is part of GAS, the GNU Assembler.
  
@@ -33,6 +33,17 @@
  #include "elf/x86-64.h"
  #include "opcodes/i386-init.h"
  
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#else
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#ifndef INT_MAX
+#define INT_MAX (int) (((unsigned) (-1)) >> 1)
+#endif
+#endif
+
  #ifndef REGISTER_WARNINGS
  #define REGISTER_WARNINGS 1
  #endif
@@ -684,6 +695,13 @@ static enum
      vex256
    } avxscalar;
  
+/* Encode VEX WIG instructions with specific vex.w.  */
+static enum
+  {
+    vexw0 = 0,
+    vexw1
+  } vexwig;
+
  /* Encode scalar EVEX LIG instructions with specific vector length.  */
  static enum
    {
@@ -3353,6 +3371,7 @@ build_vex_prefix (const insn_template *t)
    unsigned int register_specifier;
    unsigned int implied_prefix;
    unsigned int vector_length;
+  unsigned int w;
  
    /* Check register specifier.  */
    if (i.vex.register_specifier)
@@ -3364,9 +3383,10 @@ build_vex_prefix (const insn_template *t)
    else
      register_specifier = 0xf;
  
-  /* Use 2-byte VEX prefix by swapping destination and source
-     operand.  */
-  if (i.vec_encoding != vex_encoding_vex3
+  /* Use 2-byte VEX prefix by swapping destination and source operand
+     if there is more than one register operand.  */
+  if (i.reg_operands > 1
+      && i.vec_encoding != vex_encoding_vex3
        && i.dir_encoding == dir_encoding_default
        && i.operands == i.reg_operands
        && operand_type_equal (&i.types[0], &i.types[i.operands - 1])
@@ -3438,10 +3458,18 @@ build_vex_prefix (const insn_template *t)
        abort ();
      }
  
+  /* Check the REX.W bit and VEXW.  */
+  if (i.tm.opcode_modifier.vexw == VEXWIG)
+    w = (vexwig == vexw1 || (i.rex & REX_W)) ? 1 : 0;
+  else if (i.tm.opcode_modifier.vexw)
+    w = i.tm.opcode_modifier.vexw == VEXW1 ? 1 : 0;
+  else
+    w = (flag_code == CODE_64BIT ? i.rex & REX_W : vexwig == vexw1) ? 1 : 0;
+
    /* Use 2-byte VEX prefix if possible.  */
-  if (i.vec_encoding != vex_encoding_vex3
+  if (w == 0
+      && i.vec_encoding != vex_encoding_vex3
        && i.tm.opcode_modifier.vexopcode == VEX0F
-      && i.tm.opcode_modifier.vexw != VEXW1
        && (i.rex & (REX_W | REX_X | REX_B)) == 0)
      {
        /* 2-byte VEX prefix.  */
@@ -3460,7 +3488,7 @@ build_vex_prefix (const insn_template *t)
    else
      {
        /* 3-byte VEX prefix.  */
-      unsigned int m, w;
+      unsigned int m;
  
        i.vex.length = 3;
  
@@ -3498,11 +3526,6 @@ build_vex_prefix (const insn_template *t)
          of RXB bits from REX.  */
        i.vex.bytes[1] = (~i.rex & 0x7) << 5 | m;
  
-      /* Check the REX.W bit.  */
-      w = (i.rex & REX_W) ? 1 : 0;
-      if (i.tm.opcode_modifier.vexw == VEXW1)
-       w = 1;
-
        i.vex.bytes[2] = (w << 7
                         | register_specifier << 3
                         | vector_length << 2
@@ -3628,19 +3651,13 @@ build_evex_prefix (void)
    i.vrex &= ~vrex_used;
    gas_assert (i.vrex == 0);
  
-  /* Check the REX.W bit.  */
-  w = (i.rex & REX_W) ? 1 : 0;
-  if (i.tm.opcode_modifier.vexw)
-    {
-      if (i.tm.opcode_modifier.vexw == VEXW1)
-       w = 1;
-    }
-  /* If w is not set it means we are dealing with WIG instruction.  */
-  else if (!w)
-    {
-      if (evexwig == evexw1)
-        w = 1;
-    }
+  /* Check the REX.W bit and VEXW.  */
+  if (i.tm.opcode_modifier.vexw == VEXWIG)
+    w = (evexwig == evexw1 || (i.rex & REX_W)) ? 1 : 0;
+  else if (i.tm.opcode_modifier.vexw)
+    w = i.tm.opcode_modifier.vexw == VEXW1 ? 1 : 0;
+  else
+    w = (flag_code == CODE_64BIT ? i.rex & REX_W : evexwig == evexw1) ? 1 : 0;
  
    /* Encode the U bit.  */
    implied_prefix |= 0x4;
@@ -3960,8 +3977,7 @@ optimize_encoding (void)
             }
         }
      }
-  else if (optimize > 1
-          && i.reg_operands == 3
+  else if (i.reg_operands == 3
            && i.op[0].regs == i.op[1].regs
            && !i.types[2].bitfield.xmmword
            && (i.tm.opcode_modifier.vex
@@ -3969,10 +3985,10 @@ optimize_encoding (void)
                    && !i.rounding
                    && is_evex_encoding (&i.tm)
                    && (i.vec_encoding != vex_encoding_evex
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl
                        || i.tm.cpu_flags.bitfield.cpuavx512vl
                        || (i.tm.operand_types[2].bitfield.zmmword
-                          && i.types[2].bitfield.ymmword)
-                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
+                          && i.types[2].bitfield.ymmword))))
            && ((i.tm.base_opcode == 0x55
                 || i.tm.base_opcode == 0x6655
                 || i.tm.base_opcode == 0x66df
@@ -3989,15 +4005,15 @@ optimize_encoding (void)
                 || i.tm.base_opcode == 0x6647)
                && i.tm.extension_opcode == None))
      {
-      /* Optimize: -O2:
+      /* Optimize: -O1:
            VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,
            vpsubq and vpsubw:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              VEX VOP %ymmM, %ymmM, %ymmN
                -> VEX VOP %xmmM, %xmmM, %xmmN
            VOP, one of vpandn and vpxor:
@@ -4006,17 +4022,17 @@ optimize_encoding (void)
            VOP, one of vpandnd and vpandnq:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
            VOP, one of vpxord and vpxorq:
              EVEX VOP %zmmM, %zmmM, %zmmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+              -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
            VOP, one of kxord and kxorq:
              VEX VOP %kM, %kM, %kN
                -> VEX kxorw %kM, %kM, %kN
@@ -4026,14 +4042,16 @@ optimize_encoding (void)
         */
        if (is_evex_encoding (&i.tm))
         {
-         if (i.vec_encoding == vex_encoding_evex)
-           i.tm.opcode_modifier.evex = EVEX128;
-         else
+         if (i.vec_encoding != vex_encoding_evex)
             {
               i.tm.opcode_modifier.vex = VEX128;
               i.tm.opcode_modifier.vexw = VEXW0;
               i.tm.opcode_modifier.evex = 0;
             }
+         else if (optimize > 1)
+           i.tm.opcode_modifier.evex = EVEX128;
+         else
+           return;
         }
        else if (i.tm.operand_types[0].bitfield.regmask)
         {
@@ -4050,6 +4068,56 @@ optimize_encoding (void)
             i.types[j].bitfield.ymmword = 0;
           }
      }
+  else if ((cpu_arch_flags.bitfield.cpuavx
+           || cpu_arch_isa_flags.bitfield.cpuavx)
+          && i.vec_encoding != vex_encoding_evex
+          && !i.types[0].bitfield.zmmword
+          && !i.mask
+          && is_evex_encoding (&i.tm)
+          && (i.tm.base_opcode == 0x666f
+              || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
+              || i.tm.base_opcode == 0xf36f
+              || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
+              || i.tm.base_opcode == 0xf26f
+              || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+          && i.tm.extension_opcode == None)
+    {
+      /* Optimize: -O1:
+          VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
+          vmovdqu32 and vmovdqu64:
+            EVEX VOP %xmmM, %xmmN
+              -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
+            EVEX VOP %ymmM, %ymmN
+              -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
+            EVEX VOP %xmmM, mem
+              -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
+            EVEX VOP %ymmM, mem
+              -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
+            EVEX VOP mem, %xmmN
+             -> VEX vmovdqa|vmovdqu mem, %xmmN (N < 16)
+            EVEX VOP mem, %ymmN
+              -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
+       */
+      if (i.tm.base_opcode == 0xf26f)
+       i.tm.base_opcode = 0xf36f;
+      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+       i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
+      i.tm.opcode_modifier.vex
+       = i.types[0].bitfield.ymmword ? VEX256 : VEX128;
+      i.tm.opcode_modifier.vexw = VEXW0;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.masking = 0;
+      i.tm.opcode_modifier.disp8memshift = 0;
+      i.memshift = 0;
+      for (j = 0; j < 2; j++)
+       if (operand_type_check (i.types[j], disp)
+           && i.op[j].disps->X_op == O_constant)
+         {
+           i.types[j].bitfield.disp8
+             = fits_in_disp8 (i.op[j].disps->X_add_number);
+           break;
+         }
+    }
  }
  
  /* This is the guts of the machine-dependent assembler.  LINE points to a
@@ -4444,10 +4512,10 @@ parse_insn (char *line, char *mnemonic)
             }
           /* If we are in 16-bit mode, do not allow addr16 or data16.
              Similarly, in 32-bit mode, do not allow addr32 or data32.  */
-         if ((current_templates->start->opcode_modifier.size16
-              || current_templates->start->opcode_modifier.size32)
+         if ((current_templates->start->opcode_modifier.size == SIZE16
+              || current_templates->start->opcode_modifier.size == SIZE32)
               && flag_code != CODE_64BIT
-             && (current_templates->start->opcode_modifier.size32
+             && ((current_templates->start->opcode_modifier.size == SIZE32)
                   ^ (flag_code == CODE_16BIT)))
             {
               as_bad (_("redundant %s prefix"),
@@ -4555,46 +4623,50 @@ parse_insn (char *line, char *mnemonic)
    if (!current_templates)
      {
  check_suffix:
-      /* See if we can get a match by trimming off a suffix.  */
-      switch (mnem_p[-1])
+      if (mnem_p > mnemonic)
         {
-       case WORD_MNEM_SUFFIX:
-         if (intel_syntax && (intel_float_operand (mnemonic) & 2))
-           i.suffix = SHORT_MNEM_SUFFIX;
-         else
-           /* Fall through.  */
-       case BYTE_MNEM_SUFFIX:
-       case QWORD_MNEM_SUFFIX:
-         i.suffix = mnem_p[-1];
-         mnem_p[-1] = '\0';
-         current_templates = (const templates *) hash_find (op_hash,
-                                                             mnemonic);
-         break;
-       case SHORT_MNEM_SUFFIX:
-       case LONG_MNEM_SUFFIX:
-         if (!intel_syntax)
-           {
-             i.suffix = mnem_p[-1];
-             mnem_p[-1] = '\0';
-             current_templates = (const templates *) hash_find (op_hash,
-                                                                 mnemonic);
-           }
-         break;
-
-         /* Intel Syntax.  */
-       case 'd':
-         if (intel_syntax)
+         /* See if we can get a match by trimming off a suffix.  */
+         switch (mnem_p[-1])
             {
-             if (intel_float_operand (mnemonic) == 1)
+           case WORD_MNEM_SUFFIX:
+             if (intel_syntax && (intel_float_operand (mnemonic) & 2))
                 i.suffix = SHORT_MNEM_SUFFIX;
               else
-               i.suffix = LONG_MNEM_SUFFIX;
+               /* Fall through.  */
+             case BYTE_MNEM_SUFFIX:
+             case QWORD_MNEM_SUFFIX:
+               i.suffix = mnem_p[-1];
               mnem_p[-1] = '\0';
               current_templates = (const templates *) hash_find (op_hash,
-                                                                 mnemonic);
+                                                                mnemonic);
+             break;
+           case SHORT_MNEM_SUFFIX:
+           case LONG_MNEM_SUFFIX:
+             if (!intel_syntax)
+               {
+                 i.suffix = mnem_p[-1];
+                 mnem_p[-1] = '\0';
+                 current_templates = (const templates *) hash_find (op_hash,
+                                                                    mnemonic);
+               }
+             break;
+
+             /* Intel Syntax.  */
+           case 'd':
+             if (intel_syntax)
+               {
+                 if (intel_float_operand (mnemonic) == 1)
+                   i.suffix = SHORT_MNEM_SUFFIX;
+                 else
+                   i.suffix = LONG_MNEM_SUFFIX;
+                 mnem_p[-1] = '\0';
+                 current_templates = (const templates *) hash_find (op_hash,
+                                                                    mnemonic);
+               }
+             break;
             }
-         break;
         }
+
        if (!current_templates)
         {
           as_bad (_("no such instruction: `%s'"), token_start);
@@ -6039,11 +6111,11 @@ process_suffix (void)
  {
    /* If matched instruction specifies an explicit instruction mnemonic
       suffix, use it.  */
-  if (i.tm.opcode_modifier.size16)
+  if (i.tm.opcode_modifier.size == SIZE16)
      i.suffix = WORD_MNEM_SUFFIX;
-  else if (i.tm.opcode_modifier.size32)
+  else if (i.tm.opcode_modifier.size == SIZE32)
      i.suffix = LONG_MNEM_SUFFIX;
-  else if (i.tm.opcode_modifier.size64)
+  else if (i.tm.opcode_modifier.size == SIZE64)
      i.suffix = QWORD_MNEM_SUFFIX;
    else if (i.reg_operands)
      {
@@ -6055,27 +6127,23 @@ process_suffix (void)
              Destination register type is more significant than source
              register type.  crc32 in SSE4.2 prefers source register
              type. */
-         if (i.tm.base_opcode == 0xf20f38f1)
+         if (i.tm.base_opcode == 0xf20f38f0 && i.types[0].bitfield.reg)
             {
-             if (i.types[0].bitfield.reg && i.types[0].bitfield.word)
+             if (i.types[0].bitfield.byte)
+               i.suffix = BYTE_MNEM_SUFFIX;
+             else if (i.types[0].bitfield.word)
                 i.suffix = WORD_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.reg && i.types[0].bitfield.dword)
+             else if (i.types[0].bitfield.dword)
                 i.suffix = LONG_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.reg && i.types[0].bitfield.qword)
+             else if (i.types[0].bitfield.qword)
                 i.suffix = QWORD_MNEM_SUFFIX;
             }
-         else if (i.tm.base_opcode == 0xf20f38f0)
-           {
-             if (i.types[0].bitfield.reg && i.types[0].bitfield.byte)
-               i.suffix = BYTE_MNEM_SUFFIX;
-           }
  
           if (!i.suffix)
             {
               int op;
  
-             if (i.tm.base_opcode == 0xf20f38f1
-                 || i.tm.base_opcode == 0xf20f38f0)
+             if (i.tm.base_opcode == 0xf20f38f0)
                 {
                   /* We have to know the operand size for crc32.  */
                   as_bad (_("ambiguous memory operand size for `%s`"),
@@ -7894,7 +7962,6 @@ x86_cleanup (void)
    if (!IS_ELF || !x86_used_note)
      return;
  
-  x86_isa_1_used |= GNU_PROPERTY_X86_UINT32_VALID;
    x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X86;
  
    /* The .note.gnu.property section layout:
@@ -8417,12 +8484,13 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
               /* Check for "call/jmp *mem", "mov mem, %reg",
                  "test %reg, mem" and "binop mem, %reg" where binop
                  is one of adc, add, and, cmp, or, sbb, sub, xor
-                instructions.  Always generate R_386_GOT32X for
-                "sym*GOT" operand in 32-bit mode.  */
-             if ((generate_relax_relocations
-                  || (!object_64bit
-                      && i.rm.mode == 0
-                      && i.rm.regmem == 5))
+                instructions without data prefix.  Always generate
+                R_386_GOT32X for "sym*GOT" operand in 32-bit mode.  */
+             if (i.prefix[DATA_PREFIX] == 0
+                 && (generate_relax_relocations
+                     || (!object_64bit
+                         && i.rm.mode == 0
+                         && i.rm.regmem == 5))
                   && (i.rm.mode == 2
                       || (i.rm.mode == 0 && i.rm.regmem == 5))
                   && ((i.operands == 1
@@ -8917,6 +8985,15 @@ x86_cons (expressionS *exp, int size)
               as_bad (_("missing or invalid expression `%s'"), save);
               *input_line_pointer = c;
             }
+         else if ((got_reloc == BFD_RELOC_386_PLT32
+                   || got_reloc == BFD_RELOC_X86_64_PLT32)
+                  && exp->X_op != O_symbol)
+           {
+             char c = *input_line_pointer;
+             *input_line_pointer = 0;
+             as_bad (_("invalid PLT expression `%s'"), save);
+             *input_line_pointer = c;
+           }
         }
      }
    else
@@ -10531,9 +10608,11 @@ md_apply_fix (fixS *fixP, valueT *valP, segT seg ATTRIBUTE_UNUSED)
        {
        case BFD_RELOC_386_PLT32:
        case BFD_RELOC_X86_64_PLT32:
-       /* Make the jump instruction point to the address of the operand.  At
-          runtime we merely add the offset to the actual PLT entry.  */
-       value = -4;
+       /* Make the jump instruction point to the address of the operand.
+          At runtime we merely add the offset to the actual PLT entry.
+          NB: Subtract the offset size only for jump instructions.  */
+       if (fixP->fx_pcrel)
+         value = -4;
         break;
  
        case BFD_RELOC_386_TLS_GD:
@@ -10884,6 +10963,7 @@ const char *md_shortopts = "qnO::";
  #define OPTION_MINTEL64 (OPTION_MD_BASE + 23)
  #define OPTION_MFENCE_AS_LOCK_ADD (OPTION_MD_BASE + 24)
  #define OPTION_X86_USED_NOTE (OPTION_MD_BASE + 25)
+#define OPTION_MVEXWIG (OPTION_MD_BASE + 26)
  
  struct option md_longopts[] =
  {
@@ -10908,6 +10988,7 @@ struct option md_longopts[] =
    {"msse-check", required_argument, NULL, OPTION_MSSE_CHECK},
    {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
    {"mavxscalar", required_argument, NULL, OPTION_MAVXSCALAR},
+  {"mvexwig", required_argument, NULL, OPTION_MVEXWIG},
    {"madd-bnd-prefix", no_argument, NULL, OPTION_MADD_BND_PREFIX},
    {"mevexlig", required_argument, NULL, OPTION_MEVEXLIG},
    {"mevexwig", required_argument, NULL, OPTION_MEVEXWIG},
@@ -11225,6 +11306,15 @@ md_parse_option (int c, const char *arg)
         as_fatal (_("invalid -mavxscalar= option: `%s'"), arg);
        break;
  
+    case OPTION_MVEXWIG:
+      if (strcmp (arg, "0") == 0)
+       vexwig = evexw0;
+      else if (strcmp (arg, "1") == 0)
+       vexwig = evexw1;
+      else
+       as_fatal (_("invalid -mvexwig= option: `%s'"), arg);
+      break;
+
      case OPTION_MADD_BND_PREFIX:
        add_bnd_prefix = 1;
        break;
@@ -11314,7 +11404,7 @@ md_parse_option (int c, const char *arg)
         {
           optimize_for_space = 1;
           /* Turn on all encoding optimizations.  */
-         optimize = -1;
+         optimize = INT_MAX;
         }
        else
         {
@@ -11483,6 +11573,10 @@ md_show_usage (FILE *stream)
                            encode scalar AVX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
+  -mvexwig=[0|1] (default: 0)\n\
+                          encode VEX instructions with specific VEX.W value\n\
+                           for VEX.W bit ignored instructions\n"));
+  fprintf (stream, _("\
    -mevexlig=[128|256|512] (default: 128)\n\
                            encode scalar EVEX instructions with specific vector\n\
                             length\n"));