perf/x86/intel: Add Intel Skylake PMU support
[deliverable/linux.git] / arch / x86 / kernel / cpu / perf_event_intel.c
index b9826a981fb20fa45a7c1255e277e9ad1cd5d150..28fc27202d28e267796f1bca616d2b49e6616761 100644 (file)
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
        EVENT_CONSTRAINT_END
 };
 
+static struct event_constraint intel_skl_event_constraints[] = { /* Skylake; file-local like the slm table */
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
+       EVENT_CONSTRAINT_END
+};
+
 static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
        /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
        INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
        EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = { /* OFFCORE_RSP_X entries first, as in the sibling tables (see intel_fixup_er()) */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), /* event=0xcd,umask=0x1: the mem-loads (ldlat) event */
+       EVENT_EXTRA_END
+};
+
 EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
 EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
@@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)
        return intel_perfmon_event_map[hw_event];
 }
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
+#define SKL_DEMAND_RFO                 BIT_ULL(1)
+#define SKL_ANY_RESPONSE               BIT_ULL(16)
+#define SKL_SUPPLIER_NONE              BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
+#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM) /* local DRAM plus every remote hop */
+#define SKL_SPL_HIT                    BIT_ULL(30)
+#define SKL_SNOOP_NONE                 BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define SKL_SNOOP_MISS                 BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define SKL_SNOOP_HITM                 BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
+#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) /* bits 30-37 */
+#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD /* alias used for OP_READ in skl_hw_cache_extra_regs */
+#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SPL_HIT) /* SKL_ANY_SNOOP without SKL_SNOOP_NON_DRAM */
+#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO /* alias used for OP_WRITE in skl_hw_cache_extra_regs */
+#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE /* used for the LL RESULT_ACCESS entries */
+#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM) /* SKL_L3_MISS without the local-DRAM bit */
+
+static __initconst const u64 skl_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{ /* Skylake raw event codes (umask << 8 | event), indexed [cache][op][result] */
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0xe08,   /* DTLB_LOAD_MISSES.WALK_COMPLETED (umask 0x0e: all page sizes, same umask as the ITLB entry) */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0xe49,   /* DTLB_STORE_MISSES.WALK_COMPLETED (umask 0x0e: all page sizes, same umask as the ITLB entry) */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{ /* extra-reg values for the LL and NODE slots, which skl_hw_cache_event_ids maps to OFFCORE_RESPONSE (0x1b7) */
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, /* local DRAM */
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, /* remote hops */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, /* local DRAM */
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, /* remote hops */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
 #define SNB_DMND_DATA_RD       (1ULL << 0)
 #define SNB_DMND_RFO           (1ULL << 1)
 #define SNB_DMND_IFETCH                (1ULL << 2)
@@ -1114,7 +1323,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly =
 {
        /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
        INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
        EVENT_EXTRA_END
 };
 
@@ -1594,6 +1803,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
        loops = 0;
 again:
+       intel_pmu_lbr_read();
        intel_pmu_ack_status(status);
        if (++loops > 100) {
                static bool warned = false;
@@ -1608,16 +1818,16 @@ again:
 
        inc_irq_stat(apic_perf_irqs);
 
-       intel_pmu_lbr_read();
 
        /*
-        * CondChgd bit 63 doesn't mean any overflow status. Ignore
-        * and clear the bit.
+        * Ignore a range of extra bits in status that do not indicate
+        * overflow by themselves.
         */
-       if (__test_and_clear_bit(63, (unsigned long *)&status)) {
-               if (!status)
-                       goto done;
-       }
+       status &= ~(GLOBAL_STATUS_COND_CHG |
+                   GLOBAL_STATUS_ASIF |
+                   GLOBAL_STATUS_LBRS_FROZEN);
+       if (!status)
+               goto done;
 
        /*
         * PEBS overflow sets bit 62 in the global status register
@@ -1699,18 +1909,22 @@ intel_bts_constraints(struct perf_event *event)
        return NULL;
 }
 
-static int intel_alt_er(int idx)
+static int intel_alt_er(int idx, u64 config) /* returns the other OFFCORE_RSP index if config fits it, else idx */
 {
+       int alt_idx = idx; /* default to idx: it may be neither RSP_0 nor RSP_1, and alt_idx is read below */
        if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
                return idx;
 
        if (idx == EXTRA_REG_RSP_0)
-               return EXTRA_REG_RSP_1;
+               alt_idx = EXTRA_REG_RSP_1;
 
        if (idx == EXTRA_REG_RSP_1)
-               return EXTRA_REG_RSP_0;
+               alt_idx = EXTRA_REG_RSP_0;
 
-       return idx;
+       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) /* config has bits outside the alternate's valid_mask */
+               return idx;
+
+       return alt_idx;
 }
 
 static void intel_fixup_er(struct perf_event *event, int idx)
@@ -1799,7 +2013,7 @@ again:
                 */
                c = NULL;
        } else {
-               idx = intel_alt_er(idx);
+               idx = intel_alt_er(idx, reg->config);
                if (idx != reg->idx) {
                        raw_spin_unlock_irqrestore(&era->lock, flags);
                        goto again;
@@ -2253,6 +2467,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
        }
 }
 
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+       /* Start from the PMU-wide free_running_flags mask. */
+       unsigned long allowed = x86_pmu.free_running_flags;
+
+       /* When the event sets use_clockid, PERF_SAMPLE_TIME is dropped from the mask. */
+       return event->attr.use_clockid ? (allowed & ~PERF_SAMPLE_TIME) : allowed;
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
        int ret = x86_pmu_hw_config(event);
@@ -2263,7 +2486,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
        if (event->attr.precise_ip) {
                if (!event->attr.freq) {
                        event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
-                       if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+                       if (!(event->attr.sample_type &
+                             ~intel_pmu_free_running_flags(event)))
                                event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
                }
                if (x86_pmu.pebs_aliases)
@@ -2685,6 +2909,8 @@ static __initconst const struct x86_pmu core_pmu = {
        .event_map              = intel_pmu_event_map,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+
        /*
         * Intel PMCs cannot be accessed sanely above 32-bit width,
         * so we install an artificial 1<<31 period regardless of
@@ -2723,6 +2949,7 @@ static __initconst const struct x86_pmu intel_pmu = {
        .event_map              = intel_pmu_event_map,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
@@ -3260,6 +3487,29 @@ __init int intel_pmu_init(void)
                pr_cont("Broadwell events, ");
                break;
 
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_skl();
+
+               x86_pmu.event_constraints = intel_skl_event_constraints;
+               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_skl_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config; /* Skylake reuses the hsw_* hooks and event attrs */
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               WARN_ON(!x86_pmu.format_attrs);
+
+               pr_cont("Skylake events, ");
+               break;
+
        default:
                switch (x86_pmu.version) {
                case 1:
This page took 0.029391 seconds and 5 git commands to generate.