Loop Id: 181 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 3.83% |
---|
Loop Id: 181 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 3.83% |
---|
0x435580 VCMPPD $0x1,%ZMM30,%ZMM11,%K1 |
0x435587 VMOVAPD %ZMM6,%ZMM4{%K1}{z} |
0x43558d VSUBPD %ZMM29,%ZMM10,%ZMM5 |
0x435593 VFMADD213PD %ZMM28,%ZMM4,%ZMM5 |
0x435599 VMULPD %ZMM27,%ZMM5,%ZMM4 |
0x43559f VMOVUPD %ZMM4,(%R14,%RSI,8) [3] |
0x4355a6 ADD $0x8,%RSI |
0x4355aa CMP %R9,%RSI |
0x4355ad JGE 435780 |
0x4355b3 LEA (%RCX,%RSI,1),%RDI |
0x4355b7 VMOVUPD (%R13,%RSI,8),%ZMM27 [8] |
0x4355bf VFPCLASSPD $0x50,%ZMM27,%K1 |
0x4355c6 VPBLENDMD %YMM26,%YMM23,%YMM5{%K1} |
0x4355cc VPMOVSXDQ %YMM5,%ZMM5 |
0x4355d2 VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
0x4355d8 VPXOR %XMM6,%XMM6,%XMM6 |
0x4355dc VPMULLQ %ZMM5,%ZMM1,%ZMM6 |
0x4355e2 VPBROADCASTQ %RDI,%ZMM28 |
0x4355e8 VPSUBQ %ZMM2,%ZMM28,%ZMM28 |
0x4355ee VPSLLQ $0x3,%ZMM28,%ZMM28 |
0x4355f5 VPADDQ %ZMM16,%ZMM28,%ZMM28 |
0x4355fb VPADDQ %ZMM28,%ZMM8,%ZMM29 |
0x435601 VPADDQ %ZMM6,%ZMM29,%ZMM6 |
0x435607 VPXORD %XMM29,%XMM29,%XMM29 |
0x43560d KXNORW %K0,%K0,%K2 |
0x435611 VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} [7] |
0x43561c VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
0x435622 VPADDQ %ZMM28,%ZMM9,%ZMM6 |
0x435628 VPADDQ %ZMM5,%ZMM6,%ZMM5 |
0x43562e VPXORD %XMM28,%XMM28,%XMM28 |
0x435634 KXNORW %K0,%K0,%K2 |
0x435638 VGATHERQPD (,%ZMM5,1),%ZMM28{%K2} [5] |
0x435643 VPBLENDMD %YMM24,%YMM25,%YMM5{%K1} |
0x435649 VPMOVSXDQ %YMM5,%ZMM5 |
0x43564f VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
0x435655 VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
0x43565b VPADDQ %ZMM5,%ZMM6,%ZMM5 |
0x435661 VXORPD %XMM30,%XMM30,%XMM30 |
0x435667 KXNORW %K0,%K0,%K2 |
0x43566b VGATHERQPD (,%ZMM5,1),%ZMM30{%K2} [2] |
0x435676 VPBLENDMD %YMM23,%YMM26,%YMM5{%K1} |
0x43567c VANDPD %ZMM7,%ZMM27,%ZMM31 |
0x435682 VDIVPD %ZMM29,%ZMM31,%ZMM29 |
0x435688 VPMOVSXDQ %YMM5,%ZMM5 |
0x43568e VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
0x435694 VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
0x43569a VPADDQ %ZMM5,%ZMM6,%ZMM5 |
0x4356a0 VXORPD %XMM31,%XMM31,%XMM31 |
0x4356a6 KXNORW %K0,%K0,%K2 |
0x4356aa VGATHERQPD (,%ZMM5,1),%ZMM31{%K2} [1] |
0x4356b5 VPXOR %XMM6,%XMM6,%XMM6 |
0x4356b9 VSUBPD %ZMM30,%ZMM28,%ZMM5 |
0x4356bf VSUBPD %ZMM28,%ZMM31,%ZMM31 |
0x4356c5 VMULPD %ZMM5,%ZMM31,%ZMM30 |
0x4356cb VCMPPD $0x1,%ZMM30,%ZMM6,%K0 |
0x4356d2 KORTESTB %K0,%K0 |
0x4356d6 JE 435580 |
0x4356dc VCMPPD $0x1,%ZMM30,%ZMM11,%K2 |
0x4356e3 VPBLENDMD %YMM26,%YMM25,%YMM4{%K1} |
0x4356e9 VMOVSD (%R15,%RDX,8),%XMM17 [4] |
0x4356f0 VANDPD %ZMM7,%ZMM5,%ZMM5 |
0x4356f6 VANDPD %ZMM7,%ZMM31,%ZMM6 |
0x4356fc VSUBPD %ZMM29,%ZMM12,%ZMM18 |
0x435702 VMULPD %ZMM18,%ZMM6,%ZMM18 |
0x435708 VDIVSD %XMM17,%XMM13,%XMM19 |
0x43570e VBROADCASTSD %XMM19,%ZMM19 |
0x435714 VCMPPD $0x2,%ZMM6,%ZMM5,%K1 |
0x43571b VMOVAPD %ZMM5,%ZMM6{%K1} |
0x435721 VPMOVSXDQ %YMM4,%ZMM4 |
0x435727 VPSUBQ %ZMM0,%ZMM4,%ZMM4 |
0x43572d VXORPD %XMM21,%XMM21,%XMM21 |
0x435733 VGATHERQPD (%R15,%ZMM4,8),%ZMM21{%K2} [6] |
0x43573a VFMADD213PD %ZMM5,%ZMM29,%ZMM5 |
0x435740 VDIVPD %ZMM21,%ZMM5,%ZMM4 |
0x435746 VFMADD231PD %ZMM19,%ZMM18,%ZMM4 |
0x43574c VMULSD %XMM14,%XMM17,%XMM5 |
0x435752 VBROADCASTSD %XMM5,%ZMM5 |
0x435758 VMULPD %ZMM4,%ZMM5,%ZMM4 |
0x43575e VCMPPD $0x2,%ZMM6,%ZMM4,%K1 |
0x435765 VMOVAPD %ZMM4,%ZMM6{%K1} |
0x43576b VFPCLASSPD $0x56,%ZMM31,%K1 |
0x435772 VXORPD %ZMM15,%ZMM6,%ZMM6{%K1} |
0x435778 JMP 435580 |
/scratch_na/users/xoserete/qaas_runs/171-215-0463/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 81 - 241 |
-------------------------------------------------------------------------------- |
81: IF(mom_sweep.EQ.1)THEN ! x 1 |
[...] |
215: IF(node_flux(j,k).LT.0.0)THEN |
[...] |
227: sigma=ABS(node_flux(j,k))/(node_mass_pre(j,donor)) |
228: width=celldy(k) |
229: vdiffuw=vel1(j,donor)-vel1(j,upwind) |
230: vdiffdw=vel1(j,downwind)-vel1(j,donor) |
231: limiter=0.0 |
232: IF(vdiffuw*vdiffdw.GT.0.0)THEN |
233: auw=ABS(vdiffuw) |
234: adw=ABS(vdiffdw) |
235: wind=1.0_8 |
236: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
237: limiter=wind*MIN(width*((2.0_8-sigma)*adw/width+(1.0_8+sigma)*auw/celldy(dif))/6.0_8,auw,adw) |
238: ENDIF |
239: advec_vel_s=vel1(j,donor)+(1.0_8-sigma)*limiter |
240: mom_flux(j,k)=advec_vel_s*node_flux(j,k) |
241: ENDDO |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.08 |
CQA speedup if FP arith vectorized | 1.01 |
CQA speedup if fully vectorized | 1.03 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:227-241 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 34.75 |
CQA cycles if no scalar integer | 32.25 |
CQA cycles if FP arith vectorized | 34.56 |
CQA cycles if fully vectorized | 33.75 |
Front-end cycles | 17.83 |
DIV/SQRT cycles | 26.00 |
P0 cycles | 34.75 |
P1 cycles | 6.00 |
P2 cycles | 12.50 |
P3 cycles | 12.50 |
P4 cycles | 0.50 |
P5 cycles | 34.75 |
P6 cycles | 2.00 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 1.00 |
P11 cycles | 12.50 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 36.16 - 94.99 |
Stall cycles (UFS) | 20.56 - 79.25 |
Nb insns | 70.00 |
Nb uops | 107.00 |
Nb loads | 6.00 |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 2.79 |
Nb FLOP add-sub | 28.00 |
Nb FLOP mul | 24.50 |
Nb FLOP fma | 16.00 |
Nb FLOP div | 12.50 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 12.25 |
Bytes prefetched | 0.00 |
Bytes loaded | 356.00 |
Bytes stored | 64.00 |
Stride 0 | 0.50 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 2.50 |
Vectorization ratio all | 94.77 |
Vectorization ratio load | 92.86 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 94.44 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 83.33 |
Vectorization ratio other | 93.88 |
Vector-efficiency ratio all | 81.02 |
Vector-efficiency ratio load | 93.75 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 95.14 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 85.42 |
Vector-efficiency ratio other | 66.17 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.07 |
CQA speedup if FP arith vectorized | 1.01 |
CQA speedup if fully vectorized | 1.04 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:227-241 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 40.50 |
CQA cycles if no scalar integer | 38.00 |
CQA cycles if FP arith vectorized | 40.13 |
CQA cycles if fully vectorized | 38.88 |
Front-end cycles | 20.50 |
DIV/SQRT cycles | 36.00 |
P0 cycles | 40.50 |
P1 cycles | 7.00 |
P2 cycles | 14.00 |
P3 cycles | 14.00 |
P4 cycles | 0.50 |
P5 cycles | 40.50 |
P6 cycles | 2.00 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 1.00 |
P11 cycles | 14.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 42.48 - 95.20 |
Stall cycles (UFS) | 24.34 - 76.95 |
Nb insns | 83.00 |
Nb uops | 123.00 |
Nb loads | 7.00 |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 3.21 |
Nb FLOP add-sub | 32.00 |
Nb FLOP mul | 33.00 |
Nb FLOP fma | 24.00 |
Nb FLOP div | 17.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 11.26 |
Bytes prefetched | 0.00 |
Bytes loaded | 392.00 |
Bytes stored | 64.00 |
Stride 0 | 1.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 3.00 |
Vectorization ratio all | 91.67 |
Vectorization ratio load | 85.71 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 88.89 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 66.67 |
Vectorization ratio other | 92.11 |
Vector-efficiency ratio all | 79.86 |
Vector-efficiency ratio load | 87.50 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 90.28 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 70.83 |
Vector-efficiency ratio other | 68.75 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.09 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:227-241 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 29.00 |
CQA cycles if no scalar integer | 26.50 |
CQA cycles if FP arith vectorized | 29.00 |
CQA cycles if fully vectorized | 28.63 |
Front-end cycles | 15.17 |
DIV/SQRT cycles | 16.00 |
P0 cycles | 29.00 |
P1 cycles | 5.00 |
P2 cycles | 11.00 |
P3 cycles | 11.00 |
P4 cycles | 0.50 |
P5 cycles | 29.00 |
P6 cycles | 2.00 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 1.00 |
P11 cycles | 11.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 29.85 - 94.78 |
Stall cycles (UFS) | 16.79 - 81.54 |
Nb insns | 57.00 |
Nb uops | 91.00 |
Nb loads | 5.00 |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 2.21 |
Nb FLOP add-sub | 24.00 |
Nb FLOP mul | 16.00 |
Nb FLOP fma | 8.00 |
Nb FLOP div | 8.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 13.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 320.00 |
Bytes stored | 64.00 |
Stride 0 | 0.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 2.00 |
Vectorization ratio all | 97.87 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 95.65 |
Vector-efficiency ratio all | 82.18 |
Vector-efficiency ratio load | 100.00 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 63.59 |
Path / |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 70 |
nb uops | 107 |
loop length | 428.50 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 8 |
used ymm registers | 5.50 |
used zmm registers | 20.50 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 1.15 |
micro-operation queue | 17.83 cycles |
front end | 17.83 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 34.75 | 6.00 | 12.50 | 12.50 | 0.50 | 34.75 | 2.00 | 0.50 | 0.50 | 0.50 | 1.00 | 12.50 |
cycles | 34.75 | 6.00 | 12.50 | 12.50 | 0.50 | 34.75 | 2.00 | 0.50 | 0.50 | 0.50 | 1.00 | 12.50 |
Cycles executing div or sqrt instructions | 26.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 36.16-94.99 |
Stall cycles | 20.56-79.25 |
RS full (events) | 35.16-0.43 |
PRF_FLOAT full (events) | 0.04-82.50 |
Front-end | 17.83 |
Dispatch | 34.75 |
DIV/SQRT | 26.00 |
Data deps. | 0.00 |
Overall L1 | 34.75 |
all | 96% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 92% |
all | 94% |
load | 92% |
store | 100% |
mul | 90% |
add-sub | 100% |
fma | 100% |
div/sqrt | 83% |
other | 95% |
all | 94% |
load | 92% |
store | 100% |
mul | 94% |
add-sub | 100% |
fma | 100% |
div/sqrt | 83% |
other | 93% |
all | 74% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 43% |
all | 88% |
load | 93% |
store | 100% |
mul | 91% |
add-sub | 100% |
fma | 100% |
div/sqrt | 85% |
other | 84% |
all | 81% |
load | 93% |
store | 100% |
mul | 95% |
add-sub | 100% |
fma | 100% |
div/sqrt | 85% |
other | 66% |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 83 |
nb uops | 123 |
loop length | 509 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 11 |
used ymm registers | 6 |
used zmm registers | 23 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 0.80 |
micro-operation queue | 20.50 cycles |
front end | 20.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 40.50 | 7.00 | 14.00 | 14.00 | 0.50 | 40.50 | 2.00 | 0.50 | 0.50 | 0.50 | 1.00 | 14.00 |
cycles | 40.50 | 7.00 | 14.00 | 14.00 | 0.50 | 40.50 | 2.00 | 0.50 | 0.50 | 0.50 | 1.00 | 14.00 |
Cycles executing div or sqrt instructions | 36.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 42.48-95.20 |
Stall cycles | 24.34-76.95 |
RS full (events) | 41.22-0.48 |
PRF_FLOAT full (events) | 0.08-80.19 |
Front-end | 20.50 |
Dispatch | 40.50 |
DIV/SQRT | 36.00 |
Data deps. | 0.00 |
Overall L1 | 40.50 |
all | 96% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 92% |
all | 88% |
load | 85% |
store | 100% |
mul | 80% |
add-sub | 100% |
fma | 100% |
div/sqrt | 66% |
other | 91% |
all | 91% |
load | 85% |
store | 100% |
mul | 88% |
add-sub | 100% |
fma | 100% |
div/sqrt | 66% |
other | 92% |
all | 73% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 43% |
all | 84% |
load | 87% |
store | 100% |
mul | 82% |
add-sub | 100% |
fma | 100% |
div/sqrt | 70% |
other | 83% |
all | 79% |
load | 87% |
store | 100% |
mul | 90% |
add-sub | 100% |
fma | 100% |
div/sqrt | 70% |
other | 68% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VCMPPD $0x1,%ZMM30,%ZMM11,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM6,%ZMM4{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM29,%ZMM10,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM28,%ZMM4,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM27,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM4,(%R14,%RSI,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD $0x8,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R9,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 435780 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x40f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%RCX,%RSI,1),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVUPD (%R13,%RSI,8),%ZMM27 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VFPCLASSPD $0x50,%ZMM27,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBLENDMD %YMM26,%YMM23,%YMM5{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM5,%ZMM1,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPBROADCASTQ %RDI,%ZMM28 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM2,%ZMM28,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSLLQ $0x3,%ZMM28,%ZMM28 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPADDQ %ZMM16,%ZMM28,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM28,%ZMM8,%ZMM29 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM6,%ZMM29,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM29,%XMM29,%XMM29 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM28,%ZMM9,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM28,%XMM28,%XMM28 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM5,1),%ZMM28{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %YMM24,%YMM25,%YMM5{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM30,%XMM30,%XMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM5,1),%ZMM30{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %YMM23,%YMM26,%YMM5{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VANDPD %ZMM7,%ZMM27,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VDIVPD %ZMM29,%ZMM31,%ZMM29 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM31,%XMM31,%XMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM5,1),%ZMM31{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VSUBPD %ZMM30,%ZMM28,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM28,%ZMM31,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM5,%ZMM31,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM30,%ZMM6,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K0,%K0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JE 435580 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3ef0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VCMPPD $0x1,%ZMM30,%ZMM11,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBLENDMD %YMM26,%YMM25,%YMM4{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVSD (%R15,%RDX,8),%XMM17 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VANDPD %ZMM7,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM31,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSUBPD %ZMM29,%ZMM12,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM18,%ZMM6,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVSD %XMM17,%XMM13,%XMM19 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
VBROADCASTSD %XMM19,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x2,%ZMM6,%ZMM5,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM5,%ZMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPMOVSXDQ %YMM4,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VXORPD %XMM21,%XMM21,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (%R15,%ZMM4,8),%ZMM21{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VFMADD213PD %ZMM5,%ZMM29,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %ZMM21,%ZMM5,%ZMM4 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VFMADD231PD %ZMM19,%ZMM18,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM14,%XMM17,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VBROADCASTSD %XMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPD %ZMM4,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x2,%ZMM6,%ZMM4,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM4,%ZMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFPCLASSPD $0x56,%ZMM31,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VXORPD %ZMM15,%ZMM6,%ZMM6{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
JMP 435580 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3ef0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 57 |
nb uops | 91 |
loop length | 348 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 5 |
used ymm registers | 5 |
used zmm registers | 18 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 15.17 cycles |
front end | 15.17 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 29.00 | 5.00 | 11.00 | 11.00 | 0.50 | 29.00 | 2.00 | 0.50 | 0.50 | 0.50 | 1.00 | 11.00 |
cycles | 29.00 | 5.00 | 11.00 | 11.00 | 0.50 | 29.00 | 2.00 | 0.50 | 0.50 | 0.50 | 1.00 | 11.00 |
Cycles executing div or sqrt instructions | 16.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 29.85-94.78 |
Stall cycles | 16.79-81.54 |
RS full (events) | 29.10-0.39 |
Front-end | 15.17 |
Dispatch | 29.00 |
DIV/SQRT | 16.00 |
Data deps. | 0.00 |
Overall L1 | 29.00 |
all | 96% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 91% |
all | 100% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 97% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 95% |
all | 74% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 42% |
all | 92% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 86% |
all | 82% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 63% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VCMPPD $0x1,%ZMM30,%ZMM11,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM6,%ZMM4{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM29,%ZMM10,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM28,%ZMM4,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM27,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM4,(%R14,%RSI,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD $0x8,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R9,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 435780 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x40f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%RCX,%RSI,1),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVUPD (%R13,%RSI,8),%ZMM27 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VFPCLASSPD $0x50,%ZMM27,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBLENDMD %YMM26,%YMM23,%YMM5{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM5,%ZMM1,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPBROADCASTQ %RDI,%ZMM28 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM2,%ZMM28,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSLLQ $0x3,%ZMM28,%ZMM28 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPADDQ %ZMM16,%ZMM28,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM28,%ZMM8,%ZMM29 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM6,%ZMM29,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM29,%XMM29,%XMM29 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM28,%ZMM9,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM28,%XMM28,%XMM28 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM5,1),%ZMM28{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %YMM24,%YMM25,%YMM5{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM30,%XMM30,%XMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM5,1),%ZMM30{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %YMM23,%YMM26,%YMM5{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VANDPD %ZMM7,%ZMM27,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VDIVPD %ZMM29,%ZMM31,%ZMM29 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM31,%XMM31,%XMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM5,1),%ZMM31{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VSUBPD %ZMM30,%ZMM28,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM28,%ZMM31,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM5,%ZMM31,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM30,%ZMM6,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K0,%K0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JE 435580 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3ef0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |