+ else if (i.vec_encoding != vex_encoding_evex
+ && !i.types[0].bitfield.zmmword
+ && !i.types[1].bitfield.zmmword
+ && !i.mask
+ && !i.broadcast
+ && is_evex_encoding (&i.tm)
+ && ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0x666f
+ || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf36f
+ || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f
+ || (i.tm.base_opcode & ~4) == 0x66db
+ || (i.tm.base_opcode & ~4) == 0x66eb)
+ && i.tm.extension_opcode == None)
+ {
+ /* Optimize: -O1:
+ VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
+ vmovdqu32 and vmovdqu64:
+ EVEX VOP %xmmM, %xmmN
+ -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
+ EVEX VOP %ymmM, %ymmN
+ -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
+ EVEX VOP %xmmM, mem
+ -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
+ EVEX VOP %ymmM, mem
+ -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
+ EVEX VOP mem, %xmmN
+ -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)
+ EVEX VOP mem, %ymmN
+ -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
+ VOP, one of vpand, vpandn, vpor, vpxor:
+ EVEX VOP{d,q} %xmmL, %xmmM, %xmmN
+ -> VEX VOP %xmmL, %xmmM, %xmmN (L, M, and N < 16)
+ EVEX VOP{d,q} %ymmL, %ymmM, %ymmN
+ -> VEX VOP %ymmL, %ymmM, %ymmN (L, M, and N < 16)
+ EVEX VOP{d,q} mem, %xmmM, %xmmN
+ -> VEX VOP mem, %xmmM, %xmmN (M and N < 16)
+ EVEX VOP{d,q} mem, %ymmM, %ymmN
+ -> VEX VOP mem, %ymmM, %ymmN (M and N < 16)
+ */
+ for (j = 0; j < i.operands; j++)
+ if (operand_type_check (i.types[j], disp)
+ && i.op[j].disps->X_op == O_constant)
+ {
+ /* Since the VEX prefix has 2 or 3 bytes, the EVEX prefix
+ has 4 bytes, EVEX Disp8 has 1 byte and VEX Disp32 has 4
+ bytes, we choose EVEX Disp8 over VEX Disp32. */
+ int evex_disp8, vex_disp8;
+ unsigned int memshift = i.memshift;
+ offsetT n = i.op[j].disps->X_add_number;
+
+ evex_disp8 = fits_in_disp8 (n);
+ i.memshift = 0;
+ vex_disp8 = fits_in_disp8 (n);
+ if (evex_disp8 != vex_disp8)
+ {
+ i.memshift = memshift;
+ return;
+ }
+
+ i.types[j].bitfield.disp8 = vex_disp8;
+ break;
+ }
+ if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
+ i.tm.base_opcode ^= 0xf36f ^ 0xf26f;
+ i.tm.opcode_modifier.vex
+ = i.types[0].bitfield.ymmword ? VEX256 : VEX128;
+ i.tm.opcode_modifier.vexw = VEXW0;
+ /* VPAND, VPOR, and VPXOR are commutative. */
+ if (i.reg_operands == 3 && i.tm.base_opcode != 0x66df)
+ i.tm.opcode_modifier.commutative = 1;
+ i.tm.opcode_modifier.evex = 0;
+ i.tm.opcode_modifier.masking = 0;
+ i.tm.opcode_modifier.broadcast = 0;
+ i.tm.opcode_modifier.disp8memshift = 0;
+ i.memshift = 0;
+ if (j < i.operands)
+ i.types[j].bitfield.disp8
+ = fits_in_disp8 (i.op[j].disps->X_add_number);
+ }
+}
+
+/* Return non-zero for load instruction. */
+
+static int
+load_insn_p (void)
+{
+ unsigned int dest;
+ int any_vex_p = is_any_vex_encoding (&i.tm);
+ unsigned int base_opcode = i.tm.base_opcode | 1;
+
+ if (!any_vex_p)
+ {
+ /* lea */
+ if (i.tm.base_opcode == 0x8d)
+ return 0;
+
+ /* pop */
+ if ((i.tm.base_opcode & ~7) == 0x58
+ || (i.tm.base_opcode == 0x8f && i.tm.extension_opcode == 0))
+ return 1;
+
+ /* movs, cmps, lods, scas. */
+ if ((i.tm.base_opcode | 0xb) == 0xaf)
+ return 1;
+
+ /* outs */
+ if (base_opcode == 0x6f)
+ return 1;
+ }
+
+ /* No memory operand. */
+ if (!i.mem_operands)
+ return 0;
+
+ if (any_vex_p)
+ {
+ /* vldmxcsr. */
+ if (i.tm.base_opcode == 0xae
+ && i.tm.opcode_modifier.vex
+ && i.tm.opcode_modifier.vexopcode == VEX0F
+ && i.tm.extension_opcode == 2)
+ return 1;
+ }
+ else
+ {
+ /* test, not, neg, mul, imul, div, idiv. */
+ if ((i.tm.base_opcode == 0xf6 || i.tm.base_opcode == 0xf7)
+ && i.tm.extension_opcode != 1)
+ return 1;
+
+ /* inc, dec. */
+ if (base_opcode == 0xff && i.tm.extension_opcode <= 1)
+ return 1;
+
+ /* add, or, adc, sbb, and, sub, xor, cmp. */
+ if (i.tm.base_opcode >= 0x80 && i.tm.base_opcode <= 0x83)
+ return 1;
+
+ /* bt, bts, btr, btc. */
+ if (i.tm.base_opcode == 0xfba
+ && (i.tm.extension_opcode >= 4 && i.tm.extension_opcode <= 7))
+ return 1;
+
+ /* rol, ror, rcl, rcr, shl/sal, shr, sar. */
+ if ((base_opcode == 0xc1
+ || (i.tm.base_opcode >= 0xd0 && i.tm.base_opcode <= 0xd3))
+ && i.tm.extension_opcode != 6)
+ return 1;
+
+ /* cmpxchg8b, cmpxchg16b, xrstors. */
+ if (i.tm.base_opcode == 0xfc7
+ && (i.tm.extension_opcode == 1 || i.tm.extension_opcode == 3))
+ return 1;
+
+ /* fxrstor, ldmxcsr, xrstor. */
+ if (i.tm.base_opcode == 0xfae
+ && (i.tm.extension_opcode == 1
+ || i.tm.extension_opcode == 2
+ || i.tm.extension_opcode == 5))
+ return 1;
+
+ /* lgdt, lidt, lmsw. */
+ if (i.tm.base_opcode == 0xf01
+ && (i.tm.extension_opcode == 2
+ || i.tm.extension_opcode == 3
+ || i.tm.extension_opcode == 6))
+ return 1;
+
+ /* vmptrld */
+ if (i.tm.base_opcode == 0xfc7
+ && i.tm.extension_opcode == 6)
+ return 1;
+
+ /* Check for x87 instructions. */
+ if (i.tm.base_opcode >= 0xd8 && i.tm.base_opcode <= 0xdf)
+ {
+ /* Skip fst, fstp, fstenv, fstcw. */
+ if (i.tm.base_opcode == 0xd9
+ && (i.tm.extension_opcode == 2
+ || i.tm.extension_opcode == 3
+ || i.tm.extension_opcode == 6
+ || i.tm.extension_opcode == 7))
+ return 0;
+
+ /* Skip fisttp, fist, fistp, fstp. */
+ if (i.tm.base_opcode == 0xdb
+ && (i.tm.extension_opcode == 1
+ || i.tm.extension_opcode == 2
+ || i.tm.extension_opcode == 3
+ || i.tm.extension_opcode == 7))
+ return 0;
+
+ /* Skip fisttp, fst, fstp, fsave, fstsw. */
+ if (i.tm.base_opcode == 0xdd
+ && (i.tm.extension_opcode == 1
+ || i.tm.extension_opcode == 2
+ || i.tm.extension_opcode == 3
+ || i.tm.extension_opcode == 6
+ || i.tm.extension_opcode == 7))
+ return 0;
+
+ /* Skip fisttp, fist, fistp, fbstp, fistp. */
+ if (i.tm.base_opcode == 0xdf
+ && (i.tm.extension_opcode == 1
+ || i.tm.extension_opcode == 2
+ || i.tm.extension_opcode == 3
+ || i.tm.extension_opcode == 6
+ || i.tm.extension_opcode == 7))
+ return 0;
+
+ return 1;
+ }
+ }
+
+ dest = i.operands - 1;
+
+ /* Check fake imm8 operand and 3 source operands. */
+ if ((i.tm.opcode_modifier.immext
+ || i.tm.opcode_modifier.vexsources == VEX3SOURCES)
+ && i.types[dest].bitfield.imm8)
+ dest--;
+
+ /* add, or, adc, sbb, and, sub, xor, cmp, test, xchg, xadd */
+ if (!any_vex_p
+ && (base_opcode == 0x1
+ || base_opcode == 0x9
+ || base_opcode == 0x11
+ || base_opcode == 0x19
+ || base_opcode == 0x21
+ || base_opcode == 0x29
+ || base_opcode == 0x31
+ || base_opcode == 0x39
+ || (i.tm.base_opcode >= 0x84 && i.tm.base_opcode <= 0x87)
+ || base_opcode == 0xfc1))
+ return 1;
+
+ /* Check for load instruction. */
+ return (i.types[dest].bitfield.class != ClassNone
+ || i.types[dest].bitfield.instance == Accum);
+}
+
+/* Output lfence, 0xfaee8, after instruction. */
+
+static void
+insert_lfence_after (void)
+{
+ if (lfence_after_load && load_insn_p ())
+ {
+ char *p = frag_more (3);
+ *p++ = 0xf;
+ *p++ = 0xae;
+ *p = 0xe8;
+ }
+}
+
+/* Output lfence, 0xfaee8, before instruction. */
+
+static void
+insert_lfence_before (void)
+{
+ char *p;
+
+ if (is_any_vex_encoding (&i.tm))
+ return;
+
+ if (i.tm.base_opcode == 0xff
+ && (i.tm.extension_opcode == 2 || i.tm.extension_opcode == 4))
+ {
+ /* Insert lfence before indirect branch if needed. */
+
+ if (lfence_before_indirect_branch == lfence_branch_none)
+ return;
+
+ if (i.operands != 1)
+ abort ();
+
+ if (i.reg_operands == 1)
+ {
+ /* Indirect branch via register. Don't insert lfence with
+ -mlfence-after-load=yes. */
+ if (lfence_after_load
+ || lfence_before_indirect_branch == lfence_branch_memory)
+ return;
+ }
+ else if (i.mem_operands == 1
+ && lfence_before_indirect_branch != lfence_branch_register)
+ {
+ as_warn (_("indirect `%s` with memory operand should be avoided"),
+ i.tm.name);
+ return;
+ }
+ else
+ return;
+
+ if (last_insn.kind != last_insn_other
+ && last_insn.seg == now_seg)
+ {
+ as_warn_where (last_insn.file, last_insn.line,
+ _("`%s` skips -mlfence-before-indirect-branch on `%s`"),
+ last_insn.name, i.tm.name);
+ return;
+ }
+
+ p = frag_more (3);
+ *p++ = 0xf;
+ *p++ = 0xae;
+ *p = 0xe8;
+ return;
+ }
+
+ /* Output or/not and lfence before ret. */
+ if (lfence_before_ret != lfence_before_ret_none
+ && (i.tm.base_opcode == 0xc2
+ || i.tm.base_opcode == 0xc3
+ || i.tm.base_opcode == 0xca
+ || i.tm.base_opcode == 0xcb))
+ {
+ if (last_insn.kind != last_insn_other
+ && last_insn.seg == now_seg)
+ {
+ as_warn_where (last_insn.file, last_insn.line,
+ _("`%s` skips -mlfence-before-ret on `%s`"),
+ last_insn.name, i.tm.name);
+ return;
+ }
+ if (lfence_before_ret == lfence_before_ret_or)
+ {
+ /* orl: 0x830c2400. */
+ p = frag_more ((flag_code == CODE_64BIT ? 1 : 0) + 4 + 3);
+ if (flag_code == CODE_64BIT)
+ *p++ = 0x48;
+ *p++ = 0x83;
+ *p++ = 0xc;
+ *p++ = 0x24;
+ *p++ = 0x0;
+ }
+ else
+ {
+ p = frag_more ((flag_code == CODE_64BIT ? 2 : 0) + 6 + 3);
+ /* notl: 0xf71424. */
+ if (flag_code == CODE_64BIT)
+ *p++ = 0x48;
+ *p++ = 0xf7;
+ *p++ = 0x14;
+ *p++ = 0x24;
+ /* notl: 0xf71424. */
+ if (flag_code == CODE_64BIT)
+ *p++ = 0x48;
+ *p++ = 0xf7;
+ *p++ = 0x14;
+ *p++ = 0x24;
+ }
+ *p++ = 0xf;
+ *p++ = 0xae;
+ *p = 0xe8;
+ }