Loop Id: 115 | Module: exec | Source: advec_mom.cpp:181-211 [...] | Coverage: 3% |
---|
Loop Id: 115 | Module: exec | Source: advec_mom.cpp:181-211 [...] | Coverage: 3% |
---|
0x420a20 VMOVAPD %ZMM9,%ZMM5{%K2}{z} |
0x420a26 VSUBPD %ZMM3,%ZMM27,%ZMM3 |
0x420a2c VFMADD213PD %ZMM4,%ZMM5,%ZMM3 |
0x420a32 VMULPD %ZMM2,%ZMM3,%ZMM2 |
0x420a38 VPMULLQ %ZMM1,%ZMM23,%ZMM1 |
0x420a3e VPADDQ %ZMM0,%ZMM1,%ZMM0 |
0x420a44 KXNORW %K0,%K0,%K1 |
0x420a48 VSCATTERQPD %ZMM2,(%RDI,%ZMM0,8){%K1} [3] |
0x420a4f VPADDQ %ZMM28,%ZMM17,%ZMM17 |
0x420a55 ADD $0x8,%RBX |
0x420a59 CMP %R13,%RBX |
0x420a5c JAE 420c5a |
0x420a62 VMOVDQA64 %ZMM17,%ZMM0 |
0x420a68 VMOVDQA64 %ZMM16,%ZMM1 |
0x420a6e MOV $0x451520,%RAX |
0x420a75 CALL %RAX |
0x420a77 VPMOVQD %ZMM0,%YMM0 |
0x420a7d VPADDD %YMM0,%YMM18,%YMM29 |
0x420a83 VMOVDQA64 %ZMM17,%ZMM0 |
0x420a89 VMOVDQA64 %ZMM16,%ZMM1 |
0x420a8f CALL 4513a0 <__svml_i64rem8_z0> |
0x420a95 VPADDQ %ZMM19,%ZMM0,%ZMM0 |
0x420a9b VPSLLQ $0x20,%ZMM0,%ZMM0 |
0x420aa2 VPSRAQ $0x20,%ZMM0,%ZMM0 |
0x420aa9 VPMOVSXDQ %YMM29,%ZMM1 |
0x420aaf VPXOR %XMM2,%XMM2,%XMM2 |
0x420ab3 VPMULLQ %ZMM1,%ZMM20,%ZMM2 |
0x420ab9 VPADDQ %ZMM0,%ZMM2,%ZMM3 |
0x420abf VPXOR %XMM2,%XMM2,%XMM2 |
0x420ac3 KXNORW %K0,%K0,%K1 |
0x420ac7 MOV -0x60(%RBP),%RAX [10] |
0x420acb VGATHERQPD (%RAX,%ZMM3,8),%ZMM2{%K1} [5] |
0x420ad2 VFPCLASSPD $0x50,%ZMM2,%K1 |
0x420ad9 VPCMPEQD %YMM6,%YMM6,%YMM6 |
0x420add VPSUBD %YMM6,%YMM29,%YMM5 |
0x420ae3 VPMOVSXDQ %YMM5,%ZMM7 |
0x420ae9 VPBLENDMQ %ZMM7,%ZMM1,%ZMM4{%K1} |
0x420aef VPXOR %XMM3,%XMM3,%XMM3 |
0x420af3 VPMULLQ %ZMM4,%ZMM21,%ZMM3 |
0x420af9 VPADDQ %ZMM0,%ZMM3,%ZMM3 |
0x420aff VXORPD %XMM8,%XMM8,%XMM8 |
0x420b04 KXNORW %K0,%K0,%K2 |
0x420b08 MOV -0x58(%RBP),%RAX [10] |
0x420b0c VGATHERQPD (%RAX,%ZMM3,8),%ZMM8{%K2} [1] |
0x420b13 VPADDD %YMM6,%YMM29,%YMM6 |
0x420b19 VMOVDQA %YMM6,%YMM9 |
0x420b1d VANDPD %ZMM25,%ZMM2,%ZMM3 |
0x420b23 VDIVPD %ZMM8,%ZMM3,%ZMM3 |
0x420b29 VPADDD %YMM24,%YMM29,%YMM9{%K1} |
0x420b2f VPMULLQ %ZMM4,%ZMM22,%ZMM4 |
0x420b35 VPADDQ %ZMM0,%ZMM4,%ZMM8 |
0x420b3b VPXOR %XMM4,%XMM4,%XMM4 |
0x420b3f KXNORW %K0,%K0,%K2 |
0x420b43 VGATHERQPD (%R15,%ZMM8,8),%ZMM4{%K2} [2] |
0x420b4a VPMOVSXDQ %YMM9,%ZMM8 |
0x420b50 VPMULLQ %ZMM8,%ZMM22,%ZMM8 |
0x420b56 VPADDQ %ZMM0,%ZMM8,%ZMM8 |
0x420b5c VPXOR %XMM9,%XMM9,%XMM9 |
0x420b61 KXNORW %K0,%K0,%K2 |
0x420b65 VGATHERQPD (%R15,%ZMM8,8),%ZMM9{%K2} [9] |
0x420b6c VMOVDQA64 %ZMM1,%ZMM7{%K1} |
0x420b72 VPMULLQ %ZMM7,%ZMM22,%ZMM7 |
0x420b78 VPADDQ %ZMM0,%ZMM7,%ZMM7 |
0x420b7e VXORPD %XMM10,%XMM10,%XMM10 |
0x420b83 KXNORW %K0,%K0,%K2 |
0x420b87 VGATHERQPD (%R15,%ZMM7,8),%ZMM10{%K2} [4] |
0x420b8e VSUBPD %ZMM9,%ZMM4,%ZMM8 |
0x420b94 VSUBPD %ZMM4,%ZMM10,%ZMM7 |
0x420b9a VMULPD %ZMM8,%ZMM7,%ZMM9 |
0x420ba0 VCMPPD $0x1,%ZMM9,%ZMM26,%K2 |
0x420ba7 KORTESTB %K2,%K2 |
0x420bab VXORPD %XMM9,%XMM9,%XMM9 |
0x420bb0 JE 420a20 |
0x420bb6 MOV -0x50(%RBP),%RAX [10] |
0x420bba MOV 0x8(%RAX),%RAX [11] |
0x420bbe KMOVQ %K2,%K3 |
0x420bc3 VGATHERQPD (%RAX,%ZMM1,8),%ZMM9{%K3} [8] |
0x420bca VMOVDQA32 %YMM5,%YMM6{%K1} |
0x420bd0 VANDPD %ZMM25,%ZMM8,%ZMM5 |
0x420bd6 VXORPD %XMM8,%XMM8,%XMM8 |
0x420bdb KMOVQ %K2,%K1 |
0x420be0 VGATHERDPD (%RAX,%YMM6,8),%ZMM8{%K1} [6] |
0x420be7 VANDPD %ZMM25,%ZMM7,%ZMM6 |
0x420bed VBROADCASTSD 0x43ac1(%RIP),%ZMM10 [7] |
0x420bf7 VSUBPD %ZMM3,%ZMM10,%ZMM10 |
0x420bfd VMULPD %ZMM10,%ZMM6,%ZMM10 |
0x420c03 VDIVPD %ZMM9,%ZMM10,%ZMM10 |
0x420c09 VCMPPD $0x1,%ZMM7,%ZMM26,%K1 |
0x420c10 VMINPD %ZMM6,%ZMM5,%ZMM6 |
0x420c16 VFMADD213PD %ZMM5,%ZMM3,%ZMM5 |
0x420c1c VDIVPD %ZMM8,%ZMM5,%ZMM5 |
0x420c22 VADDPD %ZMM10,%ZMM5,%ZMM5 |
0x420c28 VMULPD %ZMM30,%ZMM9,%ZMM7 |
0x420c2e VMULPD %ZMM5,%ZMM7,%ZMM5 |
0x420c34 VMINPD %ZMM6,%ZMM5,%ZMM5 |
0x420c3a VXORPD %ZMM31,%ZMM5,%ZMM9 |
0x420c40 VMOVAPD %ZMM5,%ZMM9{%K1} |
0x420c46 JMP 420a20 |
/scratch_na/users/xoserete/qaas_runs/171-415-4687/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
/scratch_na/users/xoserete/qaas_runs/171-415-4687/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_mom.cpp: 181 - 211 |
-------------------------------------------------------------------------------- |
181: for (int j = (y_min - 1 + 1); j < (y_max + 1 + 2); j++) { |
182: for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) |
183: ({ |
184: int upwind, donor, downwind, dif; |
185: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
186: if (node_flux(i, j) < 0.0) { |
[...] |
192: upwind = j - 1; |
193: donor = j; |
194: downwind = j + 1; |
195: dif = upwind; |
196: } |
197: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(i, donor)); |
198: width = celldy[j]; |
199: vdiffuw = vel1(i, donor) - vel1(i, upwind); |
200: vdiffdw = vel1(i, downwind) - vel1(i, donor); |
201: limiter = 0.0; |
202: if (vdiffuw * vdiffdw > 0.0) { |
203: auw = std::fabs(vdiffuw); |
204: adw = std::fabs(vdiffdw); |
205: wind = 1.0; |
206: if (vdiffdw <= 0.0) wind = -1.0; |
207: limiter = |
208: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldy[dif]) / 6.0, auw), adw); |
209: } |
210: advec_vel_s = vel1(i, donor) + (1.0 - sigma) * limiter; |
211: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.04 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.16 |
Bottlenecks | |
Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D |
Source | context.h:46-46,context.h:69-69,advec_mom.cpp:181-186,advec_mom.cpp:192-211 |
Source loop unroll info | unrolled by 8 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 8 |
CQA cycles | 41.25 |
CQA cycles if no scalar integer | 39.50 |
CQA cycles if FP arith vectorized | 41.25 |
CQA cycles if fully vectorized | 41.25 |
Front-end cycles | 26.42 |
DIV/SQRT cycles | 41.00 |
P0 cycles | 5.50 |
P1 cycles | 17.17 |
P2 cycles | 17.17 |
P3 cycles | 5.00 |
P4 cycles | 41.00 |
P5 cycles | 3.00 |
P6 cycles | 5.00 |
P7 cycles | 5.00 |
P8 cycles | 5.00 |
P9 cycles | 3.00 |
P10 cycles | 17.17 |
P11 cycles | 32.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 44.73 - 150.60 |
Stall cycles (UFS) | 23.08 - 128.84 |
Nb insns | 85.50 |
Nb uops | 158.50 |
Nb loads | 9.50 |
Nb stores | 1.00 |
Nb stack references | 2.50 |
FLOP/cycle | 2.42 |
Nb FLOP add-sub | 32.00 |
Nb FLOP mul | 28.00 |
Nb FLOP fma | 12.00 |
Nb FLOP div | 16.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 11.55 |
Bytes prefetched | 0.00 |
Bytes loaded | 412.00 |
Bytes stored | 64.00 |
Stride 0 | 1.50 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 5.50 |
Vectorization ratio all | 99.35 |
Vectorization ratio load | 93.75 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 98.84 |
Vector-efficiency ratio all | 82.59 |
Vector-efficiency ratio load | 94.53 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 87.45 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 73.94 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.01 |
Bottlenecks | P0, |
Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D |
Source | context.h:46-46,context.h:69-69,advec_mom.cpp:181-186,advec_mom.cpp:192-211 |
Source loop unroll info | unrolled by 8 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 8 |
CQA cycles | 48.00 |
CQA cycles if no scalar integer | 48.00 |
CQA cycles if FP arith vectorized | 48.00 |
CQA cycles if fully vectorized | 48.00 |
Front-end cycles | 29.50 |
DIV/SQRT cycles | 47.50 |
P0 cycles | 6.00 |
P1 cycles | 20.33 |
P2 cycles | 20.33 |
P3 cycles | 5.00 |
P4 cycles | 47.50 |
P5 cycles | 3.00 |
P6 cycles | 5.00 |
P7 cycles | 5.00 |
P8 cycles | 5.00 |
P9 cycles | 3.00 |
P10 cycles | 20.33 |
P11 cycles | 48.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 54.22 - 212.96 |
Stall cycles (UFS) | 29.73 - 188.44 |
Nb insns | 98.00 |
Nb uops | 177.00 |
Nb loads | 12.00 |
Nb stores | 1.00 |
Nb stack references | 3.00 |
FLOP/cycle | 2.83 |
Nb FLOP add-sub | 40.00 |
Nb FLOP mul | 40.00 |
Nb FLOP fma | 16.00 |
Nb FLOP div | 24.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 11.50 |
Bytes prefetched | 0.00 |
Bytes loaded | 488.00 |
Bytes stored | 64.00 |
Stride 0 | 2.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 7.00 |
Vectorization ratio all | 98.70 |
Vectorization ratio load | 87.50 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 97.67 |
Vector-efficiency ratio all | 83.60 |
Vector-efficiency ratio load | 89.06 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 88.24 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 75.29 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.11 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.48 |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D |
Source | context.h:46-46,context.h:69-69,advec_mom.cpp:181-186,advec_mom.cpp:192-211 |
Source loop unroll info | unrolled by 8 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 8 |
CQA cycles | 34.50 |
CQA cycles if no scalar integer | 31.00 |
CQA cycles if FP arith vectorized | 34.50 |
CQA cycles if fully vectorized | 34.50 |
Front-end cycles | 23.33 |
DIV/SQRT cycles | 34.50 |
P0 cycles | 5.00 |
P1 cycles | 14.00 |
P2 cycles | 14.00 |
P3 cycles | 5.00 |
P4 cycles | 34.50 |
P5 cycles | 3.00 |
P6 cycles | 5.00 |
P7 cycles | 5.00 |
P8 cycles | 5.00 |
P9 cycles | 3.00 |
P10 cycles | 14.00 |
P11 cycles | 16.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 35.24 - 88.25 |
Stall cycles (UFS) | 16.44 - 69.23 |
Nb insns | 73.00 |
Nb uops | 140.00 |
Nb loads | 7.00 |
Nb stores | 1.00 |
Nb stack references | 2.00 |
FLOP/cycle | 1.86 |
Nb FLOP add-sub | 24.00 |
Nb FLOP mul | 16.00 |
Nb FLOP fma | 8.00 |
Nb FLOP div | 8.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 11.59 |
Bytes prefetched | 0.00 |
Bytes loaded | 336.00 |
Bytes stored | 64.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 4.00 |
Vectorization ratio all | 100.00 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 100.00 |
Vector-efficiency ratio all | 81.58 |
Vector-efficiency ratio load | 100.00 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 86.67 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 72.58 |
Path / |
nb instructions | 85.50 |
nb uops | 158.50 |
loop length | 480.50 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 7 |
used zmm registers | 22.50 |
nb stack references | 2.50 |
ADD-SUB / MUL ratio | 1.25 |
micro-operation queue | 26.42 cycles |
front end | 26.42 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 41.00 | 5.00 | 17.17 | 17.17 | 5.00 | 41.00 | 3.00 | 5.00 | 5.00 | 5.00 | 3.00 | 17.17 |
cycles | 41.00 | 5.50 | 17.17 | 17.17 | 5.00 | 41.00 | 3.00 | 5.00 | 5.00 | 5.00 | 3.00 | 17.17 |
Cycles executing div or sqrt instructions | 32.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 44.73-150.61 |
Stall cycles | 23.08-128.84 |
ROB full (events) | 3.35-132.63 |
RS full (events) | 37.86-1.07 |
Front-end | 26.42 |
Dispatch | 41.00 |
DIV/SQRT | 32.00 |
Data deps. | 1.00 |
Overall L1 | 41.25 |
all | 100% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 100% |
all | 98% |
load | 93% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 97% |
all | 99% |
load | 93% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 98% |
all | 77% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 83% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 66% |
all | 89% |
load | 94% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 82% |
all | 82% |
load | 94% |
store | 100% |
mul | 100% |
add-sub | 87% |
fma | 100% |
div/sqrt | 100% |
other | 73% |
nb instructions | 98 |
nb uops | 177 |
loop length | 555 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 7 |
used zmm registers | 24 |
nb stack references | 3 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 29.50 cycles |
front end | 29.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 47.50 | 5.00 | 20.33 | 20.33 | 5.00 | 47.50 | 3.00 | 5.00 | 5.00 | 5.00 | 3.00 | 20.33 |
cycles | 47.50 | 6.00 | 20.33 | 20.33 | 5.00 | 47.50 | 3.00 | 5.00 | 5.00 | 5.00 | 3.00 | 20.33 |
Cycles executing div or sqrt instructions | 48.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 54.22-212.96 |
Stall cycles | 29.73-188.45 |
ROB full (events) | 6.69-193.43 |
RS full (events) | 43.33-1.52 |
Front-end | 29.50 |
Dispatch | 47.50 |
DIV/SQRT | 48.00 |
Data deps. | 1.00 |
Overall L1 | 48.00 |
all | 100% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 100% |
all | 97% |
load | 87% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 95% |
all | 98% |
load | 87% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 97% |
all | 76% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 83% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 66% |
all | 90% |
load | 89% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 83% |
all | 83% |
load | 89% |
store | 100% |
mul | 100% |
add-sub | 88% |
fma | 100% |
div/sqrt | 100% |
other | 75% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVAPD %ZMM9,%ZMM5{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM3,%ZMM27,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM4,%ZMM5,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPMULLQ %ZMM1,%ZMM23,%ZMM1 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM1,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VSCATTERQPD %ZMM2,(%RDI,%ZMM0,8){%K1} | 20 | 2.20 | 0.20 | 0 | 0 | 4 | 0.20 | 0.20 | 4 | 4 | 4 | 0.20 | 0 | 2-12 | 7 |
VPADDQ %ZMM28,%ZMM17,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD $0x8,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R13,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 420c5a <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii.extracted.7+0x3da> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVDQA64 %ZMM17,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVDQA64 %ZMM16,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
MOV $0x451520,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CALL %RAX | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 2.14 |
VPMOVQD %ZMM0,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPADDD %YMM0,%YMM18,%YMM29 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVDQA64 %ZMM17,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVDQA64 %ZMM16,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
CALL 4513a0 <__svml_i64rem8_z0> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
VPADDQ %ZMM19,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPSLLQ $0x20,%ZMM0,%ZMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPSRAQ $0x20,%ZMM0,%ZMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPMOVSXDQ %YMM29,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM1,%ZMM20,%ZMM2 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
MOV -0x60(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VGATHERQPD (%RAX,%ZMM3,8),%ZMM2{%K1} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VFPCLASSPD $0x50,%ZMM2,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPCMPEQD %YMM6,%YMM6,%YMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPSUBD %YMM6,%YMM29,%YMM5 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
VPMOVSXDQ %YMM5,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBLENDMQ %ZMM7,%ZMM1,%ZMM4{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM4,%ZMM21,%ZMM3 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM8,%XMM8,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VGATHERQPD (%RAX,%ZMM3,8),%ZMM8{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPADDD %YMM6,%YMM29,%YMM6 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVDQA %YMM6,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VANDPD %ZMM25,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VDIVPD %ZMM8,%ZMM3,%ZMM3 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VPADDD %YMM24,%YMM29,%YMM9{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPMULLQ %ZMM4,%ZMM22,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM4,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (%R15,%ZMM8,8),%ZMM4{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMOVSXDQ %YMM9,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMULLQ %ZMM8,%ZMM22,%ZMM8 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM8,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (%R15,%ZMM8,8),%ZMM9{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVDQA64 %ZMM1,%ZMM7{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPMULLQ %ZMM7,%ZMM22,%ZMM7 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM7,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM10,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (%R15,%ZMM7,8),%ZMM10{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VSUBPD %ZMM9,%ZMM4,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM4,%ZMM10,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM8,%ZMM7,%ZMM9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM9,%ZMM26,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K2,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VXORPD %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JE 420a20 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii.extracted.7+0x1a0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x50(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VGATHERQPD (%RAX,%ZMM1,8),%ZMM9{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVDQA32 %YMM5,%YMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VANDPD %ZMM25,%ZMM8,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM8,%XMM8,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KMOVQ %K2,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VGATHERDPD (%RAX,%YMM6,8),%ZMM8{%K1} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VANDPD %ZMM25,%ZMM7,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD 0x43ac1(%RIP),%ZMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 |
VSUBPD %ZMM3,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM10,%ZMM6,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %ZMM9,%ZMM10,%ZMM10 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VCMPPD $0x1,%ZMM7,%ZMM26,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINPD %ZMM6,%ZMM5,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM5,%ZMM3,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %ZMM8,%ZMM5,%ZMM5 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VADDPD %ZMM10,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM30,%ZMM9,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM5,%ZMM7,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM6,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VXORPD %ZMM31,%ZMM5,%ZMM9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VMOVAPD %ZMM5,%ZMM9{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
JMP 420a20 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii.extracted.7+0x1a0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
nb instructions | 73 |
nb uops | 140 |
loop length | 406 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 7 |
used zmm registers | 21 |
nb stack references | 2 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 23.33 cycles |
front end | 23.33 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 34.50 | 5.00 | 14.00 | 14.00 | 5.00 | 34.50 | 3.00 | 5.00 | 5.00 | 5.00 | 3.00 | 14.00 |
cycles | 34.50 | 5.00 | 14.00 | 14.00 | 5.00 | 34.50 | 3.00 | 5.00 | 5.00 | 5.00 | 3.00 | 14.00 |
Cycles executing div or sqrt instructions | 16.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 35.24-88.25 |
Stall cycles | 16.44-69.23 |
RS full (events) | 32.39-0.62 |
Front-end | 23.33 |
Dispatch | 34.50 |
DIV/SQRT | 16.00 |
Data deps. | 1.00 |
Overall L1 | 34.50 |
all | 100% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 100% |
all | 100% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 100% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 77% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 83% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 67% |
all | 88% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 81% |
all | 81% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 86% |
fma | 100% |
div/sqrt | 100% |
other | 72% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVAPD %ZMM9,%ZMM5{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM3,%ZMM27,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM4,%ZMM5,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPMULLQ %ZMM1,%ZMM23,%ZMM1 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM1,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VSCATTERQPD %ZMM2,(%RDI,%ZMM0,8){%K1} | 20 | 2.20 | 0.20 | 0 | 0 | 4 | 0.20 | 0.20 | 4 | 4 | 4 | 0.20 | 0 | 2-12 | 7 |
VPADDQ %ZMM28,%ZMM17,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD $0x8,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R13,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 420c5a <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii.extracted.7+0x3da> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVDQA64 %ZMM17,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVDQA64 %ZMM16,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
MOV $0x451520,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CALL %RAX | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 2.14 |
VPMOVQD %ZMM0,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPADDD %YMM0,%YMM18,%YMM29 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVDQA64 %ZMM17,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVDQA64 %ZMM16,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
CALL 4513a0 <__svml_i64rem8_z0> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
VPADDQ %ZMM19,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPSLLQ $0x20,%ZMM0,%ZMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPSRAQ $0x20,%ZMM0,%ZMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPMOVSXDQ %YMM29,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM1,%ZMM20,%ZMM2 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
MOV -0x60(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VGATHERQPD (%RAX,%ZMM3,8),%ZMM2{%K1} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VFPCLASSPD $0x50,%ZMM2,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPCMPEQD %YMM6,%YMM6,%YMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPSUBD %YMM6,%YMM29,%YMM5 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
VPMOVSXDQ %YMM5,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBLENDMQ %ZMM7,%ZMM1,%ZMM4{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM4,%ZMM21,%ZMM3 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM8,%XMM8,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VGATHERQPD (%RAX,%ZMM3,8),%ZMM8{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPADDD %YMM6,%YMM29,%YMM6 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVDQA %YMM6,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VANDPD %ZMM25,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VDIVPD %ZMM8,%ZMM3,%ZMM3 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VPADDD %YMM24,%YMM29,%YMM9{%K1} | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPMULLQ %ZMM4,%ZMM22,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM4,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (%R15,%ZMM8,8),%ZMM4{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMOVSXDQ %YMM9,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMULLQ %ZMM8,%ZMM22,%ZMM8 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM8,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (%R15,%ZMM8,8),%ZMM9{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVDQA64 %ZMM1,%ZMM7{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPMULLQ %ZMM7,%ZMM22,%ZMM7 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM0,%ZMM7,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM10,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (%R15,%ZMM7,8),%ZMM10{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VSUBPD %ZMM9,%ZMM4,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM4,%ZMM10,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM8,%ZMM7,%ZMM9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM9,%ZMM26,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K2,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VXORPD %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JE 420a20 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii.extracted.7+0x1a0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |