mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-28 05:24:47 +08:00
perf vendor events intel: Update jaketown TMA metrics to 4.7
Top-Down Microarchitecture Analysis (TMA) metrics simplify cycle-accounting using microarchitecture-abstracted metrics organized in one hierarchy. This update is from version 4.5 to 4.7.

The update includes:
- Swapped tma_info_core_ilp (becomes per SMT thread) and tma_info_pipeline_execute (per physical core).
- Tuned thresholds for tma_fetch_bandwidth.

The update came from:
https://github.com/intel/perfmon/pull/140
https://github.com/intel/perfmon/pull/138

Running the script:
https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Samantha Alt <samantha.alt@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240214011820.644458-25-irogers@google.com
This commit is contained in:
parent
14bc1a59f2
commit
5f9a13bee0
@ -163,7 +163,7 @@
|
||||
"MetricExpr": "tma_frontend_bound - tma_fetch_latency",
|
||||
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
|
||||
"MetricName": "tma_fetch_bandwidth",
|
||||
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
|
||||
"MetricThreshold": "tma_fetch_bandwidth > 0.2",
|
||||
"MetricgroupNoGroup": "TopdownL2",
|
||||
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp",
|
||||
"ScaleUnit": "100%"
|
||||
@ -193,7 +193,7 @@
|
||||
"MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
|
||||
"MetricName": "tma_fp_scalar",
|
||||
"MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
|
||||
"PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
|
||||
"PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
@ -202,7 +202,25 @@
|
||||
"MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
|
||||
"MetricName": "tma_fp_vector",
|
||||
"MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
|
||||
"PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
|
||||
"PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
|
||||
"MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD",
|
||||
"MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
|
||||
"MetricName": "tma_fp_vector_128b",
|
||||
"MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
|
||||
"PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
|
||||
"MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_DISPATCHED.THREAD",
|
||||
"MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
|
||||
"MetricName": "tma_fp_vector_256b",
|
||||
"MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
|
||||
"PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
@ -222,7 +240,7 @@
|
||||
"MetricName": "tma_heavy_operations",
|
||||
"MetricThreshold": "tma_heavy_operations > 0.1",
|
||||
"MetricgroupNoGroup": "TopdownL2",
|
||||
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
|
||||
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
@ -244,7 +262,7 @@
|
||||
"MetricName": "tma_info_core_flopc"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
|
||||
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
|
||||
"MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
|
||||
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
|
||||
"MetricName": "tma_info_core_ilp"
|
||||
@ -271,21 +289,27 @@
|
||||
"MetricName": "tma_info_pipeline_retire"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
|
||||
"BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
|
||||
"MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
|
||||
"MetricGroup": "Power;Summary",
|
||||
"MetricName": "tma_info_system_average_frequency"
|
||||
"MetricName": "tma_info_system_core_frequency"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Average CPU Utilization",
|
||||
"BriefDescription": "Average CPU Utilization (percentage)",
|
||||
"MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
|
||||
"MetricGroup": "HPC;Summary",
|
||||
"MetricName": "tma_info_system_cpu_utilization"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Average number of utilized CPUs",
|
||||
"MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
|
||||
"MetricGroup": "Summary",
|
||||
"MetricName": "tma_info_system_cpus_utilized"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
|
||||
"MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
|
||||
"MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
|
||||
"MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
|
||||
"MetricName": "tma_info_system_dram_bw_use",
|
||||
"PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth"
|
||||
},
|
||||
@ -294,7 +318,7 @@
|
||||
"MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time",
|
||||
"MetricGroup": "Cor;Flops;HPC",
|
||||
"MetricName": "tma_info_system_gflops",
|
||||
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
|
||||
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
|
||||
@ -348,6 +372,12 @@
|
||||
"MetricGroup": "Power",
|
||||
"MetricName": "tma_info_system_turbo_utilization"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
|
||||
"MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
|
||||
"MetricGroup": "SoC",
|
||||
"MetricName": "tma_info_system_uncore_frequency"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
|
||||
"MetricExpr": "CPU_CLK_UNHALTED.THREAD",
|
||||
@ -389,7 +419,7 @@
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
|
||||
"MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
|
||||
"MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
|
||||
"MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
|
||||
"MetricName": "tma_itlb_misses",
|
||||
"MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
|
||||
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
|
||||
@ -399,7 +429,7 @@
|
||||
"BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
|
||||
"MetricConstraint": "NO_GROUP_EVENTS_SMT",
|
||||
"MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
|
||||
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
|
||||
"MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
|
||||
"MetricName": "tma_l3_bound",
|
||||
"MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
|
||||
"PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
|
||||
@ -421,7 +451,7 @@
|
||||
"MetricName": "tma_light_operations",
|
||||
"MetricThreshold": "tma_light_operations > 0.6",
|
||||
"MetricgroupNoGroup": "TopdownL2",
|
||||
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
|
||||
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
@ -436,21 +466,21 @@
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
|
||||
"BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
|
||||
"MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
|
||||
"MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
|
||||
"MetricName": "tma_mem_bandwidth",
|
||||
"MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
|
||||
"PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
|
||||
"PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
|
||||
"BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
|
||||
"MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
|
||||
"MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
|
||||
"MetricName": "tma_mem_latency",
|
||||
"MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
|
||||
"PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ",
|
||||
"PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ",
|
||||
"ScaleUnit": "100%"
|
||||
},
|
||||
{
|
||||
|
@ -2,10 +2,10 @@
|
||||
"Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
@ -23,7 +23,9 @@
|
||||
"L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
"MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
|
||||
@ -88,6 +90,7 @@
|
||||
"tma_issueTLB": "Metrics related by the issue $issueTLB",
|
||||
"tma_l1_bound_group": "Metrics contributing to tma_l1_bound category",
|
||||
"tma_light_operations_group": "Metrics contributing to tma_light_operations category",
|
||||
"tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
|
||||
"tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
|
||||
"tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
|
||||
"tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
|
||||
|
Loading…
Reference in New Issue
Block a user