Loop Id: 161 | Module: exec | Source: generate_chunk.cpp:75-77 [...] | Coverage: 0.04% |
---|
Loop Id: 161 | Module: exec | Source: generate_chunk.cpp:75-77 [...] | Coverage: 0.04% |
---|
0x242920 VEXTRACTI32X4 $0x3,%ZMM0,%XMM9 |
0x242927 VPEXTRQ $0x1,%XMM6,%RSI |
0x24292d VMOVQ %XMM1,%RDI |
0x242932 KXNORW %K0,%K0,%K1 |
0x242936 VPEXTRQ $0x1,%XMM9,%RAX |
0x24293c CQTO |
0x24293e IDIV %RSI |
0x242941 VMOVQ %XMM6,%RSI |
0x242946 VMOVQ %RAX,%XMM10 |
0x24294b VMOVQ %XMM9,%RAX |
0x242950 CQTO |
0x242952 IDIV %RSI |
0x242955 VPEXTRQ $0x1,%XMM7,%RSI |
0x24295b VMOVQ %RAX,%XMM9 |
0x242960 VPUNPCKLQDQ %XMM10,%XMM9,%XMM9 |
0x242965 VEXTRACTI32X4 $0x2,%ZMM0,%XMM10 |
0x24296c VPEXTRQ $0x1,%XMM10,%RAX |
0x242972 CQTO |
0x242974 IDIV %RSI |
0x242977 VMOVQ %XMM7,%RSI |
0x24297c VMOVQ %RAX,%XMM11 |
0x242981 VMOVQ %XMM10,%RAX |
0x242986 CQTO |
0x242988 IDIV %RSI |
0x24298b VMOVQ %RAX,%XMM10 |
0x242990 VPUNPCKLQDQ %XMM11,%XMM10,%XMM10 |
0x242995 VEXTRACTI128 $0x1,%YMM1,%XMM11 |
0x24299b VINSERTI128 $0x1,%XMM9,%YMM10,%YMM9 |
0x2429a1 VEXTRACTI128 $0x1,%YMM0,%XMM10 |
0x2429a7 VPEXTRQ $0x1,%XMM11,%RSI |
0x2429ad VPEXTRQ $0x1,%XMM10,%RAX |
0x2429b3 CQTO |
0x2429b5 IDIV %RSI |
0x2429b8 VMOVQ %XMM11,%RSI |
0x2429bd VMOVQ %RAX,%XMM12 |
0x2429c2 VMOVQ %XMM10,%RAX |
0x2429c7 CQTO |
0x2429c9 IDIV %RSI |
0x2429cc VPEXTRQ $0x1,%XMM1,%RSI |
0x2429d2 VMOVQ %RAX,%XMM10 |
0x2429d7 VPEXTRQ $0x1,%XMM0,%RAX |
0x2429dd CQTO |
0x2429df VPUNPCKLQDQ %XMM12,%XMM10,%XMM10 |
0x2429e4 IDIV %RSI |
0x2429e7 MOV %RAX,%RSI |
0x2429ea VMOVQ %XMM0,%RAX |
0x2429ef VMOVQ %RSI,%XMM11 |
0x2429f4 CQTO |
0x2429f6 IDIV %RDI |
0x2429f9 MOV -0x48(%RBP),%RDX [7] |
0x2429fd ADD $-0x8,%R14 |
0x242a01 VMOVQ %RAX,%XMM12 |
0x242a06 VPUNPCKLQDQ %XMM11,%XMM12,%XMM11 |
0x242a0b VINSERTI128 $0x1,%XMM10,%YMM11,%YMM10 |
0x242a11 VBROADCASTSD (%RDX),%ZMM11 [4] |
0x242a17 VINSERTI64X4 $0x1,%YMM9,%ZMM10,%ZMM9 |
0x242a1e VPMULLQ %ZMM1,%ZMM9,%ZMM10 |
0x242a24 VPSLLQ $0x20,%ZMM9,%ZMM9 |
0x242a2b VPSRAQ $0x20,%ZMM9,%ZMM9 |
0x242a32 VPMULLQ %ZMM2,%ZMM9,%ZMM12 |
0x242a38 VPSUBQ %ZMM10,%ZMM0,%ZMM10 |
0x242a3e VPADDQ %ZMM8,%ZMM0,%ZMM0 |
0x242a44 VPSLLQ $0x20,%ZMM10,%ZMM10 |
0x242a4b VPSRAQ $0x20,%ZMM10,%ZMM10 |
0x242a52 VPADDQ %ZMM12,%ZMM10,%ZMM12 |
0x242a58 VSCATTERQPD %ZMM11,(%R8,%ZMM12,8){%K1} [2] |
0x242a5f VPMULLQ %ZMM3,%ZMM9,%ZMM12 |
0x242a65 KXNORW %K0,%K0,%K1 |
0x242a69 VBROADCASTSD (%R10),%ZMM11 [3] |
0x242a6f VPADDQ %ZMM12,%ZMM10,%ZMM12 |
0x242a75 VSCATTERQPD %ZMM11,(%RBX,%ZMM12,8){%K1} [6] |
0x242a7c VPMULLQ %ZMM4,%ZMM9,%ZMM12 |
0x242a82 KXNORW %K0,%K0,%K1 |
0x242a86 VPMULLQ %ZMM5,%ZMM9,%ZMM9 |
0x242a8c VBROADCASTSD (%R13),%ZMM11 [8] |
0x242a93 VPADDQ %ZMM12,%ZMM10,%ZMM12 |
0x242a99 VPADDQ %ZMM9,%ZMM10,%ZMM9 |
0x242a9f VSCATTERQPD %ZMM11,(%R11,%ZMM12,8){%K1} [5] |
0x242aa6 KXNORW %K0,%K0,%K1 |
0x242aaa VBROADCASTSD (%R15),%ZMM11 [1] |
0x242ab0 VSCATTERQPD %ZMM11,(%R9,%ZMM9,8){%K1} [9] |
0x242ab7 JNE 242920 |
/beegfs/hackathon/users/eoseret/qaas_runs/170-854-8685/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/generate_chunk.cpp: 75 - 77 |
-------------------------------------------------------------------------------- |
75: for (int j = (0); j < (yrange); j++) { |
76: for (int i = (0); i < (xrange); i++) { |
77: field.energy0(i, j) = state_energy[0]; |
/beegfs/hackathon/users/eoseret/qaas_runs/170-854-8685/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 69 - 69 |
-------------------------------------------------------------------------------- |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.27 - 2.18 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.38 - 2.37 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 - 2.07 |
Bottlenecks | P8, P9, |
Function | .omp_outlined.#0x242770 |
Source | generate_chunk.cpp:75-77,context.h:69-69 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 56.00 - 96.00 |
CQA cycles if no scalar integer | 44.00 |
CQA cycles if FP arith vectorized | 56.00 - 96.00 |
CQA cycles if fully vectorized | 40.50 |
Front-end cycles | 46.33 |
DIV/SQRT cycles | 16.00 |
P0 cycles | 0.50 |
P1 cycles | 8.00 |
P2 cycles | 0.50 |
P3 cycles | 1.00 |
P4 cycles | 1.67 |
P5 cycles | 1.67 |
P6 cycles | 1.67 |
P7 cycles | 13.00 |
P8 cycles | 20.50 |
P9 cycles | 20.33 |
P10 cycles | 20.17 |
P11 cycles | 44.00 |
P12 cycles | 44.00 |
P13 cycles | 56.00 - 96.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 82.00 |
Nb uops | 278.00 |
Nb loads | 13.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 0.00 - 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 3.08 - 5.29 |
Bytes prefetched | 0.00 |
Bytes loaded | 40.00 |
Bytes stored | 256.00 |
Stride 0 | 4.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 5.00 |
Vectorization ratio all | 34.67 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 21.15 |
Vector-efficiency ratio all | 36.17 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 21.39 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.27 - 2.18 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.38 - 2.37 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 - 2.07 |
Bottlenecks | P8, P9, |
Function | .omp_outlined.#0x242770 |
Source | generate_chunk.cpp:75-77,context.h:69-69 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 56.00 - 96.00 |
CQA cycles if no scalar integer | 44.00 |
CQA cycles if FP arith vectorized | 56.00 - 96.00 |
CQA cycles if fully vectorized | 40.50 |
Front-end cycles | 46.33 |
DIV/SQRT cycles | 16.00 |
P0 cycles | 0.50 |
P1 cycles | 8.00 |
P2 cycles | 0.50 |
P3 cycles | 1.00 |
P4 cycles | 1.67 |
P5 cycles | 1.67 |
P6 cycles | 1.67 |
P7 cycles | 13.00 |
P8 cycles | 20.50 |
P9 cycles | 20.33 |
P10 cycles | 20.17 |
P11 cycles | 44.00 |
P12 cycles | 44.00 |
P13 cycles | 56.00 - 96.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 82.00 |
Nb uops | 278.00 |
Nb loads | 13.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 0.00 - 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 3.08 - 5.29 |
Bytes prefetched | 0.00 |
Bytes loaded | 40.00 |
Bytes stored | 256.00 |
Stride 0 | 4.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 5.00 |
Vectorization ratio all | 34.67 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 21.15 |
Vector-efficiency ratio all | 36.17 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 21.39 |
Path / |
Function | .omp_outlined.#0x242770 |
Source file and lines | generate_chunk.cpp:75-77 |
Module | exec |
nb instructions | 82 |
nb uops | 278 |
loop length | 413 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 8 |
used ymm registers | 5 |
used zmm registers | 11 |
nb stack references | 1 |
micro-operation queue | 46.33 cycles |
front end | 46.33 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 16.00 | 0.50 | 8.00 | 0.50 | 1.00 | 1.67 | 1.67 | 1.67 | 13.00 | 20.50 | 20.33 | 20.17 | 44.00 | 44.00 |
cycles | 16.00 | 0.50 | 8.00 | 0.50 | 1.00 | 1.67 | 1.67 | 1.67 | 13.00 | 20.50 | 20.33 | 20.17 | 44.00 | 44.00 |
Cycles executing div or sqrt instructions | 56.00-96.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 46.33 |
Dispatch | 44.00 |
DIV/SQRT | 56.00-96.00 |
Data deps. | 1.00 |
Overall L1 | 56.00-96.00 |
all | 32% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 19% |
all | 50% |
load | 0% |
store | 100% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 34% |
load | 0% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 21% |
all | 33% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 20% |
all | 56% |
load | 12% |
store | 100% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 36% |
load | 12% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | 21% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VEXTRACTI32X4 $0x3,%ZMM0,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 4 | 1 |
VPEXTRQ $0x1,%XMM6,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VMOVQ %XMM1,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPEXTRQ $0x1,%XMM9,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %XMM6,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RAX,%XMM10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VMOVQ %XMM9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VPEXTRQ $0x1,%XMM7,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VMOVQ %RAX,%XMM9 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPUNPCKLQDQ %XMM10,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VEXTRACTI32X4 $0x2,%ZMM0,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 4 | 1 |
VPEXTRQ $0x1,%XMM10,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %XMM7,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RAX,%XMM11 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VMOVQ %XMM10,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %RAX,%XMM10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPUNPCKLQDQ %XMM11,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VEXTRACTI128 $0x1,%YMM1,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VINSERTI128 $0x1,%XMM9,%YMM10,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0.50 |
VEXTRACTI128 $0x1,%YMM0,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VPEXTRQ $0x1,%XMM11,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VPEXTRQ $0x1,%XMM10,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %XMM11,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RAX,%XMM12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VMOVQ %XMM10,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VPEXTRQ $0x1,%XMM1,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VMOVQ %RAX,%XMM10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPEXTRQ $0x1,%XMM0,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
VPUNPCKLQDQ %XMM12,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
MOV %RAX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVQ %XMM0,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RSI,%XMM11 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CQTO | |||||||||||||||||
IDIV %RDI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
MOV -0x48(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD $-0x8,%R14 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVQ %RAX,%XMM12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPUNPCKLQDQ %XMM11,%XMM12,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VINSERTI128 $0x1,%XMM10,%YMM11,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD (%RDX),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VINSERTI64X4 $0x1,%YMM9,%ZMM10,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 1 |
VPMULLQ %ZMM1,%ZMM9,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSLLQ $0x20,%ZMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPSRAQ $0x20,%ZMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPMULLQ %ZMM2,%ZMM9,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM10,%ZMM0,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM8,%ZMM0,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VPSLLQ $0x20,%ZMM10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPSRAQ $0x20,%ZMM10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPADDQ %ZMM12,%ZMM10,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VSCATTERQPD %ZMM11,(%R8,%ZMM12,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
VPMULLQ %ZMM3,%ZMM9,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD (%R10),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VPADDQ %ZMM12,%ZMM10,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VSCATTERQPD %ZMM11,(%RBX,%ZMM12,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
VPMULLQ %ZMM4,%ZMM9,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPMULLQ %ZMM5,%ZMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VBROADCASTSD (%R13),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VPADDQ %ZMM12,%ZMM10,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM9,%ZMM10,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VSCATTERQPD %ZMM11,(%R11,%ZMM12,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD (%R15),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VSCATTERQPD %ZMM11,(%R9,%ZMM9,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
JNE 242920 <.omp_outlined.+0x1b0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
Function | .omp_outlined.#0x242770 |
Source file and lines | generate_chunk.cpp:75-77 |
Module | exec |
nb instructions | 82 |
nb uops | 278 |
loop length | 413 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 8 |
used ymm registers | 5 |
used zmm registers | 11 |
nb stack references | 1 |
micro-operation queue | 46.33 cycles |
front end | 46.33 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 16.00 | 0.50 | 8.00 | 0.50 | 1.00 | 1.67 | 1.67 | 1.67 | 13.00 | 20.50 | 20.33 | 20.17 | 44.00 | 44.00 |
cycles | 16.00 | 0.50 | 8.00 | 0.50 | 1.00 | 1.67 | 1.67 | 1.67 | 13.00 | 20.50 | 20.33 | 20.17 | 44.00 | 44.00 |
Cycles executing div or sqrt instructions | 56.00-96.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 46.33 |
Dispatch | 44.00 |
DIV/SQRT | 56.00-96.00 |
Data deps. | 1.00 |
Overall L1 | 56.00-96.00 |
all | 32% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 19% |
all | 50% |
load | 0% |
store | 100% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 34% |
load | 0% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 21% |
all | 33% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 20% |
all | 56% |
load | 12% |
store | 100% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 36% |
load | 12% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | 21% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VEXTRACTI32X4 $0x3,%ZMM0,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 4 | 1 |
VPEXTRQ $0x1,%XMM6,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VMOVQ %XMM1,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPEXTRQ $0x1,%XMM9,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %XMM6,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RAX,%XMM10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VMOVQ %XMM9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VPEXTRQ $0x1,%XMM7,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VMOVQ %RAX,%XMM9 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPUNPCKLQDQ %XMM10,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VEXTRACTI32X4 $0x2,%ZMM0,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 4 | 1 |
VPEXTRQ $0x1,%XMM10,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %XMM7,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RAX,%XMM11 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VMOVQ %XMM10,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %RAX,%XMM10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPUNPCKLQDQ %XMM11,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VEXTRACTI128 $0x1,%YMM1,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VINSERTI128 $0x1,%XMM9,%YMM10,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0.50 |
VEXTRACTI128 $0x1,%YMM0,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VPEXTRQ $0x1,%XMM11,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VPEXTRQ $0x1,%XMM10,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VMOVQ %XMM11,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RAX,%XMM12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VMOVQ %XMM10,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CQTO | |||||||||||||||||
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
VPEXTRQ $0x1,%XMM1,%RSI | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
VMOVQ %RAX,%XMM10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPEXTRQ $0x1,%XMM0,%RAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CQTO | |||||||||||||||||
VPUNPCKLQDQ %XMM12,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
IDIV %RSI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
MOV %RAX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVQ %XMM0,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVQ %RSI,%XMM11 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CQTO | |||||||||||||||||
IDIV %RDI | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9-19 | 7-12 |
MOV -0x48(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD $-0x8,%R14 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVQ %RAX,%XMM12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPUNPCKLQDQ %XMM11,%XMM12,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VINSERTI128 $0x1,%XMM10,%YMM11,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD (%RDX),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VINSERTI64X4 $0x1,%YMM9,%ZMM10,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 1 |
VPMULLQ %ZMM1,%ZMM9,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSLLQ $0x20,%ZMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPSRAQ $0x20,%ZMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPMULLQ %ZMM2,%ZMM9,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM10,%ZMM0,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM8,%ZMM0,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VPSLLQ $0x20,%ZMM10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPSRAQ $0x20,%ZMM10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 1 |
VPADDQ %ZMM12,%ZMM10,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VSCATTERQPD %ZMM11,(%R8,%ZMM12,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
VPMULLQ %ZMM3,%ZMM9,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD (%R10),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VPADDQ %ZMM12,%ZMM10,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VSCATTERQPD %ZMM11,(%RBX,%ZMM12,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
VPMULLQ %ZMM4,%ZMM9,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPMULLQ %ZMM5,%ZMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VBROADCASTSD (%R13),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VPADDQ %ZMM12,%ZMM10,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM9,%ZMM10,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VSCATTERQPD %ZMM11,(%R11,%ZMM12,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
KXNORW %K0,%K0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD (%R15),%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 5 | 1 |
VSCATTERQPD %ZMM11,(%R9,%ZMM9,8){%K1} | 48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3.50 | 1.50 | 9 | 9 | 1-40 | 12.14 |
JNE 242920 <.omp_outlined.+0x1b0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |