Loop Id: 180 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 0.01% |
---|
Loop Id: 180 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 0.01% |
---|
0x435440 VMOVDQA32 %YMM26,%YMM23{%K2} |
0x435446 MOV 0x168(%RSP),%R15 |
0x43544e VMOVSD (%R15,%RDX,8),%XMM4 |
0x435454 VANDPD %ZMM7,%ZMM5,%ZMM5 |
0x43545a VANDPD %ZMM7,%ZMM27,%ZMM24 |
0x435460 VSUBPD %ZMM25,%ZMM12,%ZMM6 |
0x435466 VMULPD %ZMM6,%ZMM24,%ZMM6 |
0x43546c VDIVSD %XMM4,%XMM13,%XMM17 |
0x435472 VBROADCASTSD %XMM17,%ZMM17 |
0x435478 VCMPPD $0x2,%ZMM24,%ZMM5,%K2 |
0x43547f VMOVAPD %ZMM5,%ZMM24{%K2} |
0x435485 VFMADD213PD %ZMM5,%ZMM25,%ZMM5 |
0x43548b VPMOVSXDQ %YMM23,%ZMM18 |
0x435491 VPSUBQ %ZMM0,%ZMM18,%ZMM18 |
0x435497 VXORPD %XMM19,%XMM19,%XMM19 |
0x43549d KMOVQ %K3,%K2 |
0x4354a2 VGATHERQPD (%R15,%ZMM18,8),%ZMM19{%K2} |
0x4354a9 VMOVAPD 0x300(%RSP),%ZMM18 |
0x4354b1 VMOVAPD %ZMM19,%ZMM18{%K3} |
0x4354b7 VMOVAPD %ZMM18,0x300(%RSP) |
0x4354bf VDIVPD %ZMM18,%ZMM5,%ZMM5 |
0x4354c5 VFMADD231PD %ZMM17,%ZMM6,%ZMM5 |
0x4354cb VMULSD %XMM4,%XMM14,%XMM4 |
0x4354cf VBROADCASTSD %XMM4,%ZMM4 |
0x4354d5 VMULPD %ZMM5,%ZMM4,%ZMM4 |
0x4354db VCMPPD $0x2,%ZMM24,%ZMM4,%K2 |
0x4354e2 VMOVAPD %ZMM4,%ZMM24{%K2} |
0x4354e8 VFPCLASSPD $0x56,%ZMM27,%K2 |
0x4354ef VXORPD %ZMM15,%ZMM24,%ZMM24{%K2} |
0x4354f5 VMOVAPD %ZMM24,%ZMM4{%K3}{z} |
0x4354fb VSUBPD %ZMM25,%ZMM10,%ZMM5 |
0x435501 VFMADD213PD %ZMM20,%ZMM4,%ZMM5 |
0x435507 VMULPD %ZMM22,%ZMM5,%ZMM4 |
0x43550d IMUL %R8,%RDX |
0x435511 ADD 0xd8(%RSP),%RDX |
0x435519 VMOVUPD %ZMM4,(%RDX,%RSI,8){%K1} |
0x435520 ADD %R8,%R14 |
0x435523 ADD %R11,%R13 |
0x435526 CMP 0x38(%RSP),%R12 |
0x43552b LEA 0x1(%R12),%R12 |
0x435530 JE 43358b |
0x435536 TEST %R9,%R9 |
0x435539 JE 4357c0 |
0x43553f MOV 0x30(%RSP),%RDX |
0x435544 LEA -0x1(%RDX,%R12,1),%RDX |
0x435549 VPBROADCASTD %EDX,%YMM23 |
0x43554f SUB %RAX,%RDX |
0x435552 LEA (%RBX,%R12,1),%ESI |
0x435556 LEA 0x1(%RBX,%R12,1),%EDI |
0x43555b VPBROADCASTD %EDI,%YMM24 |
0x435561 LEA -0x2(%RBX,%R12,1),%EDI |
0x435566 VPBROADCASTD %EDI,%YMM25 |
0x43556c VPBROADCASTD %ESI,%YMM26 |
0x435572 XOR %ESI,%ESI |
0x435574 JMP 4355b3 |
(181) 0x435580 VCMPPD $0x1,%ZMM30,%ZMM11,%K1 |
(181) 0x435587 VMOVAPD %ZMM6,%ZMM4{%K1}{z} |
(181) 0x43558d VSUBPD %ZMM29,%ZMM10,%ZMM5 |
(181) 0x435593 VFMADD213PD %ZMM28,%ZMM4,%ZMM5 |
(181) 0x435599 VMULPD %ZMM27,%ZMM5,%ZMM4 |
(181) 0x43559f VMOVUPD %ZMM4,(%R14,%RSI,8) |
(181) 0x4355a6 ADD $0x8,%RSI |
(181) 0x4355aa CMP %R9,%RSI |
(181) 0x4355ad JGE 435780 |
(181) 0x4355b3 LEA (%RCX,%RSI,1),%RDI |
(181) 0x4355b7 VMOVUPD (%R13,%RSI,8),%ZMM27 |
(181) 0x4355bf VFPCLASSPD $0x50,%ZMM27,%K1 |
(181) 0x4355c6 VPBLENDMD %YMM26,%YMM23,%YMM5{%K1} |
(181) 0x4355cc VPMOVSXDQ %YMM5,%ZMM5 |
(181) 0x4355d2 VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
(181) 0x4355d8 VPXOR %XMM6,%XMM6,%XMM6 |
(181) 0x4355dc VPMULLQ %ZMM5,%ZMM1,%ZMM6 |
(181) 0x4355e2 VPBROADCASTQ %RDI,%ZMM28 |
(181) 0x4355e8 VPSUBQ %ZMM2,%ZMM28,%ZMM28 |
(181) 0x4355ee VPSLLQ $0x3,%ZMM28,%ZMM28 |
(181) 0x4355f5 VPADDQ %ZMM16,%ZMM28,%ZMM28 |
(181) 0x4355fb VPADDQ %ZMM28,%ZMM8,%ZMM29 |
(181) 0x435601 VPADDQ %ZMM6,%ZMM29,%ZMM6 |
(181) 0x435607 VPXORD %XMM29,%XMM29,%XMM29 |
(181) 0x43560d KXNORW %K0,%K0,%K2 |
(181) 0x435611 VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} |
(181) 0x43561c VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
(181) 0x435622 VPADDQ %ZMM28,%ZMM9,%ZMM6 |
(181) 0x435628 VPADDQ %ZMM5,%ZMM6,%ZMM5 |
(181) 0x43562e VPXORD %XMM28,%XMM28,%XMM28 |
(181) 0x435634 KXNORW %K0,%K0,%K2 |
(181) 0x435638 VGATHERQPD (,%ZMM5,1),%ZMM28{%K2} |
(181) 0x435643 VPBLENDMD %YMM24,%YMM25,%YMM5{%K1} |
(181) 0x435649 VPMOVSXDQ %YMM5,%ZMM5 |
(181) 0x43564f VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
(181) 0x435655 VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
(181) 0x43565b VPADDQ %ZMM5,%ZMM6,%ZMM5 |
(181) 0x435661 VXORPD %XMM30,%XMM30,%XMM30 |
(181) 0x435667 KXNORW %K0,%K0,%K2 |
(181) 0x43566b VGATHERQPD (,%ZMM5,1),%ZMM30{%K2} |
(181) 0x435676 VPBLENDMD %YMM23,%YMM26,%YMM5{%K1} |
(181) 0x43567c VANDPD %ZMM7,%ZMM27,%ZMM31 |
(181) 0x435682 VDIVPD %ZMM29,%ZMM31,%ZMM29 |
(181) 0x435688 VPMOVSXDQ %YMM5,%ZMM5 |
(181) 0x43568e VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
(181) 0x435694 VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
(181) 0x43569a VPADDQ %ZMM5,%ZMM6,%ZMM5 |
(181) 0x4356a0 VXORPD %XMM31,%XMM31,%XMM31 |
(181) 0x4356a6 KXNORW %K0,%K0,%K2 |
(181) 0x4356aa VGATHERQPD (,%ZMM5,1),%ZMM31{%K2} |
(181) 0x4356b5 VPXOR %XMM6,%XMM6,%XMM6 |
(181) 0x4356b9 VSUBPD %ZMM30,%ZMM28,%ZMM5 |
(181) 0x4356bf VSUBPD %ZMM28,%ZMM31,%ZMM31 |
(181) 0x4356c5 VMULPD %ZMM5,%ZMM31,%ZMM30 |
(181) 0x4356cb VCMPPD $0x1,%ZMM30,%ZMM6,%K0 |
(181) 0x4356d2 KORTESTB %K0,%K0 |
(181) 0x4356d6 JE 435580 |
(181) 0x4356dc VCMPPD $0x1,%ZMM30,%ZMM11,%K2 |
(181) 0x4356e3 VPBLENDMD %YMM26,%YMM25,%YMM4{%K1} |
(181) 0x4356e9 VMOVSD (%R15,%RDX,8),%XMM17 |
(181) 0x4356f0 VANDPD %ZMM7,%ZMM5,%ZMM5 |
(181) 0x4356f6 VANDPD %ZMM7,%ZMM31,%ZMM6 |
(181) 0x4356fc VSUBPD %ZMM29,%ZMM12,%ZMM18 |
(181) 0x435702 VMULPD %ZMM18,%ZMM6,%ZMM18 |
(181) 0x435708 VDIVSD %XMM17,%XMM13,%XMM19 |
(181) 0x43570e VBROADCASTSD %XMM19,%ZMM19 |
(181) 0x435714 VCMPPD $0x2,%ZMM6,%ZMM5,%K1 |
(181) 0x43571b VMOVAPD %ZMM5,%ZMM6{%K1} |
(181) 0x435721 VPMOVSXDQ %YMM4,%ZMM4 |
(181) 0x435727 VPSUBQ %ZMM0,%ZMM4,%ZMM4 |
(181) 0x43572d VXORPD %XMM21,%XMM21,%XMM21 |
(181) 0x435733 VGATHERQPD (%R15,%ZMM4,8),%ZMM21{%K2} |
(181) 0x43573a VFMADD213PD %ZMM5,%ZMM29,%ZMM5 |
(181) 0x435740 VDIVPD %ZMM21,%ZMM5,%ZMM4 |
(181) 0x435746 VFMADD231PD %ZMM19,%ZMM18,%ZMM4 |
(181) 0x43574c VMULSD %XMM14,%XMM17,%XMM5 |
(181) 0x435752 VBROADCASTSD %XMM5,%ZMM5 |
(181) 0x435758 VMULPD %ZMM4,%ZMM5,%ZMM4 |
(181) 0x43575e VCMPPD $0x2,%ZMM6,%ZMM4,%K1 |
(181) 0x435765 VMOVAPD %ZMM4,%ZMM6{%K1} |
(181) 0x43576b VFPCLASSPD $0x56,%ZMM31,%K1 |
(181) 0x435772 VXORPD %ZMM15,%ZMM6,%ZMM6{%K1} |
(181) 0x435778 JMP 435580 |
0x435780 MOV %R9,%RSI |
0x435783 CMP %R9,0x120(%RSP) |
0x43578b JE 435520 |
0x435791 JMP 4357c2 |
0x4357c0 XOR %ESI,%ESI |
0x4357c2 VPBROADCASTQ %RSI,%ZMM25 |
0x4357c8 VMOVDQA64 0x400(%RSP),%ZMM4 |
0x4357d0 VPSUBQ %ZMM25,%ZMM4,%ZMM5 |
0x4357d6 VPCMPNLEUQ 0xd3f1f(%RIP),%ZMM5,%K1 |
0x4357e1 KORTESTB %K1,%K1 |
0x4357e5 JE 435520 |
0x4357eb MOV 0x30(%RSP),%RDX |
0x4357f0 LEA -0x1(%RDX,%R12,1),%R15 |
0x4357f5 MOV %R15,%RDX |
0x4357f8 SUB %RAX,%RDX |
0x4357fb MOV %R11,%RDI |
0x4357fe IMUL %RDX,%RDI |
0x435802 ADD 0x50(%RSP),%RDI |
0x435807 ADD %RCX,%RSI |
0x43580a SUB %R10,%RSI |
0x43580d VMOVUPD (%RDI,%RSI,8),%ZMM4{%K1}{z} |
0x435814 VMOVAPD %ZMM4,%ZMM22{%K1} |
0x43581a LEA -0x2(%RBX,%R12,1),%EDI |
0x43581f VPBROADCASTD %EDI,%YMM23 |
0x435825 LEA 0x1(%RBX,%R12,1),%EDI |
0x43582a VFPCLASSPD $0x50,%ZMM22,%K2 |
0x435831 VMOVDQA64 %YMM23,%YMM4 |
0x435837 VPBROADCASTD %EDI,%YMM4{%K2} |
0x43583d LEA (%RBX,%R12,1),%EDI |
0x435841 VPBROADCASTD %EDI,%YMM26 |
0x435847 VPBROADCASTD %R15D,%YMM5 |
0x43584d VPBROADCASTD %EDI,%YMM5{%K2} |
0x435853 VMOVDQA64 %YMM26,%YMM6 |
0x435859 VPBROADCASTD %R15D,%YMM6{%K2} |
0x43585f VPMOVSXDQ %YMM5,%ZMM5 |
0x435865 VPSUBQ %ZMM0,%ZMM5,%ZMM5 |
0x43586b VPMULLQ %ZMM5,%ZMM1,%ZMM17 |
0x435871 VPXORD %XMM24,%XMM24,%XMM24 |
0x435877 VPADDQ 0x2c0(%RSP),%ZMM25,%ZMM18 |
0x43587f VPSUBQ %ZMM2,%ZMM18,%ZMM18 |
0x435885 VPSLLQ $0x3,%ZMM18,%ZMM18 |
0x43588c VPADDQ %ZMM18,%ZMM8,%ZMM19 |
0x435892 VPADDQ %ZMM17,%ZMM19,%ZMM17 |
0x435898 KMOVQ %K1,%K3 |
0x43589d VPXORD %XMM19,%XMM19,%XMM19 |
0x4358a3 VGATHERQPD (,%ZMM17,1),%ZMM19{%K3} |
0x4358ae VANDPD %ZMM7,%ZMM22,%ZMM17 |
0x4358b4 VPMULLQ %ZMM5,%ZMM3,%ZMM5 |
0x4358ba VPADDQ %ZMM18,%ZMM9,%ZMM18 |
0x4358c0 VPADDQ %ZMM5,%ZMM18,%ZMM5 |
0x4358c6 KMOVQ %K1,%K3 |
0x4358cb VXORPD %XMM21,%XMM21,%XMM21 |
0x4358d1 VGATHERQPD (,%ZMM5,1),%ZMM21{%K3} |
0x4358dc VPMOVSXDQ %YMM4,%ZMM4 |
0x4358e2 VPSUBQ %ZMM0,%ZMM4,%ZMM4 |
0x4358e8 VPMULLQ %ZMM4,%ZMM3,%ZMM4 |
0x4358ee VPADDQ %ZMM4,%ZMM18,%ZMM4 |
0x4358f4 KMOVQ %K1,%K3 |
0x4358f9 VXORPD %XMM5,%XMM5,%XMM5 |
0x4358fd VGATHERQPD (,%ZMM4,1),%ZMM5{%K3} |
0x435908 VMOVAPD 0x340(%RSP),%ZMM4 |
0x435910 VMOVAPD %ZMM19,%ZMM4{%K1} |
0x435916 VMOVAPD %ZMM4,%ZMM19 |
0x43591c VDIVPD %ZMM4,%ZMM17,%ZMM25 |
0x435922 VMOVAPD %ZMM21,%ZMM20{%K1} |
0x435928 VMOVAPD 0x280(%RSP),%ZMM17 |
0x435930 VMOVAPD %ZMM5,%ZMM17{%K1} |
0x435936 VPMOVSXDQ %YMM6,%ZMM4 |
0x43593c VPSUBQ %ZMM0,%ZMM4,%ZMM4 |
0x435942 VPMULLQ %ZMM4,%ZMM3,%ZMM4 |
0x435948 VPADDQ %ZMM4,%ZMM18,%ZMM4 |
0x43594e KMOVQ %K1,%K3 |
0x435953 VPXOR %XMM6,%XMM6,%XMM6 |
0x435957 VGATHERQPD (,%ZMM4,1),%ZMM6{%K3} |
0x435962 VSUBPD %ZMM17,%ZMM20,%ZMM5 |
0x435968 VMOVAPD 0x3c0(%RSP),%ZMM4 |
0x435970 VMOVAPD %ZMM6,%ZMM4{%K1} |
0x435976 VMOVAPD %ZMM4,%ZMM6 |
0x43597c VSUBPD %ZMM20,%ZMM4,%ZMM27 |
0x435982 VMULPD %ZMM5,%ZMM27,%ZMM4 |
0x435988 VCMPPD $0x1,%ZMM4,%ZMM24,%K3{%K1} |
0x43598f KORTESTB %K3,%K3 |
0x435993 VMOVAPD %ZMM6,0x3c0(%RSP) |
0x43599b VMOVAPD %ZMM17,0x280(%RSP) |
0x4359a3 VMOVAPD %ZMM19,0x340(%RSP) |
0x4359ab JNE 435440 |
0x4359b1 MOV 0x168(%RSP),%R15 |
0x4359b9 JMP 4354f5 |
/scratch_na/users/xoserete/qaas_runs/171-215-0463/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 81 - 241 |
-------------------------------------------------------------------------------- |
81: IF(mom_sweep.EQ.1)THEN ! x 1 |
[...] |
213: DO k=y_min-1,y_max+1 |
214: DO j=x_min,x_max+1 |
215: IF(node_flux(j,k).LT.0.0)THEN |
[...] |
227: sigma=ABS(node_flux(j,k))/(node_mass_pre(j,donor)) |
228: width=celldy(k) |
229: vdiffuw=vel1(j,donor)-vel1(j,upwind) |
230: vdiffdw=vel1(j,downwind)-vel1(j,donor) |
231: limiter=0.0 |
232: IF(vdiffuw*vdiffdw.GT.0.0)THEN |
233: auw=ABS(vdiffuw) |
234: adw=ABS(vdiffdw) |
235: wind=1.0_8 |
236: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
237: limiter=wind*MIN(width*((2.0_8-sigma)*adw/width+(1.0_8+sigma)*auw/celldy(dif))/6.0_8,auw,adw) |
238: ENDIF |
239: advec_vel_s=vel1(j,donor)+(1.0_8-sigma)*limiter |
240: mom_flux(j,k)=advec_vel_s*node_flux(j,k) |
241: ENDDO |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.08 |
CQA speedup if FP arith vectorized | 1.01 |
CQA speedup if fully vectorized | 1.16 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:213-215,advec_mom_kernel.f90:227-234,advec_mom_kernel.f90:237-240 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 46.00 |
CQA cycles if no scalar integer | 42.50 |
CQA cycles if FP arith vectorized | 45.63 |
CQA cycles if fully vectorized | 39.63 |
Front-end cycles | 30.33 |
DIV/SQRT cycles | 36.00 |
P0 cycles | 46.00 |
P1 cycles | 11.00 |
P2 cycles | 18.67 |
P3 cycles | 18.67 |
P4 cycles | 2.50 |
P5 cycles | 46.00 |
P6 cycles | 9.00 |
P7 cycles | 2.50 |
P8 cycles | 2.50 |
P9 cycles | 2.50 |
P10 cycles | 9.00 |
P11 cycles | 18.67 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 47.03 - 89.14 |
Stall cycles (UFS) | 18.90 - 60.97 |
Nb insns | 143.00 |
Nb uops | 182.00 |
Nb loads | 22.00 |
Nb stores | 5.00 |
Nb stack references | 12.00 |
FLOP/cycle | 2.83 |
Nb FLOP add-sub | 32.00 |
Nb FLOP mul | 33.00 |
Nb FLOP fma | 24.00 |
Nb FLOP div | 17.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 26.61 |
Bytes prefetched | 0.00 |
Bytes loaded | 904.00 |
Bytes stored | 320.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 79.41 |
Vectorization ratio load | 81.25 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 88.89 |
Vectorization ratio add_sub | 89.47 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 66.67 |
Vectorization ratio other | 71.43 |
Vector-efficiency ratio all | 73.53 |
Vector-efficiency ratio load | 83.59 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 90.28 |
Vector-efficiency ratio add_sub | 90.79 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 70.83 |
Vector-efficiency ratio other | 59.60 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.08 |
CQA speedup if FP arith vectorized | 1.01 |
CQA speedup if fully vectorized | 1.16 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:213-215,advec_mom_kernel.f90:227-234,advec_mom_kernel.f90:237-240 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 46.00 |
CQA cycles if no scalar integer | 42.50 |
CQA cycles if FP arith vectorized | 45.63 |
CQA cycles if fully vectorized | 39.63 |
Front-end cycles | 30.33 |
DIV/SQRT cycles | 36.00 |
P0 cycles | 46.00 |
P1 cycles | 11.00 |
P2 cycles | 18.67 |
P3 cycles | 18.67 |
P4 cycles | 2.50 |
P5 cycles | 46.00 |
P6 cycles | 9.00 |
P7 cycles | 2.50 |
P8 cycles | 2.50 |
P9 cycles | 2.50 |
P10 cycles | 9.00 |
P11 cycles | 18.67 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 47.03 - 89.14 |
Stall cycles (UFS) | 18.90 - 60.97 |
Nb insns | 143.00 |
Nb uops | 182.00 |
Nb loads | 22.00 |
Nb stores | 5.00 |
Nb stack references | 12.00 |
FLOP/cycle | 2.83 |
Nb FLOP add-sub | 32.00 |
Nb FLOP mul | 33.00 |
Nb FLOP fma | 24.00 |
Nb FLOP div | 17.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 26.61 |
Bytes prefetched | 0.00 |
Bytes loaded | 904.00 |
Bytes stored | 320.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 79.41 |
Vectorization ratio load | 81.25 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 88.89 |
Vectorization ratio add_sub | 89.47 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 66.67 |
Vectorization ratio other | 71.43 |
Vector-efficiency ratio all | 73.53 |
Vector-efficiency ratio load | 83.59 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 90.28 |
Vector-efficiency ratio add_sub | 90.79 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 70.83 |
Vector-efficiency ratio other | 59.60 |
Path / |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 143 |
nb uops | 182 |
loop length | 839 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 7 |
used zmm registers | 22 |
nb stack references | 12 |
ADD-SUB / MUL ratio | 0.80 |
micro-operation queue | 30.33 cycles |
front end | 30.33 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 46.00 | 11.00 | 18.67 | 18.67 | 2.50 | 46.00 | 9.00 | 2.50 | 2.50 | 2.50 | 9.00 | 18.67 |
cycles | 46.00 | 11.00 | 18.67 | 18.67 | 2.50 | 46.00 | 9.00 | 2.50 | 2.50 | 2.50 | 9.00 | 18.67 |
Cycles executing div or sqrt instructions | 36.00 |
FE+BE cycles | 47.03-89.14 |
Stall cycles | 18.90-60.96 |
ROB full (events) | 21.13-64.30 |
RS full (events) | 0.14-0.00 |
Front-end | 30.33 |
Dispatch | 46.00 |
DIV/SQRT | 36.00 |
Overall L1 | 46.00 |
all | 65% |
load | 60% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 86% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 46% |
all | 91% |
load | 90% |
store | 100% |
mul | 80% |
add-sub | 100% |
fma | 100% |
div/sqrt | 66% |
other | 93% |
all | 79% |
load | 81% |
store | 100% |
mul | 88% |
add-sub | 89% |
fma | 100% |
div/sqrt | 66% |
other | 71% |
all | 55% |
load | 65% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 88% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 28% |
all | 88% |
load | 92% |
store | 100% |
mul | 82% |
add-sub | 100% |
fma | 100% |
div/sqrt | 70% |
other | 86% |
all | 73% |
load | 83% |
store | 100% |
mul | 90% |
add-sub | 90% |
fma | 100% |
div/sqrt | 70% |
other | 59% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVDQA32 %YMM26,%YMM23{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
MOV 0x168(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R15,%RDX,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VANDPD %ZMM7,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM27,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSUBPD %ZMM25,%ZMM12,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM6,%ZMM24,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVSD %XMM4,%XMM13,%XMM17 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
VBROADCASTSD %XMM17,%ZMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x2,%ZMM24,%ZMM5,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM5,%ZMM24{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFMADD213PD %ZMM5,%ZMM25,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPMOVSXDQ %YMM23,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM18,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VXORPD %XMM19,%XMM19,%XMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KMOVQ %K3,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VGATHERQPD (%R15,%ZMM18,8),%ZMM19{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVAPD 0x300(%RSP),%ZMM18 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM19,%ZMM18{%K3} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM18,0x300(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VDIVPD %ZMM18,%ZMM5,%ZMM5 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VFMADD231PD %ZMM17,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VBROADCASTSD %XMM4,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPD %ZMM5,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x2,%ZMM24,%ZMM4,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM4,%ZMM24{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFPCLASSPD $0x56,%ZMM27,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VXORPD %ZMM15,%ZMM24,%ZMM24{%K2} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VMOVAPD %ZMM24,%ZMM4{%K3}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM25,%ZMM10,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM20,%ZMM4,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM22,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
IMUL %R8,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0xd8(%RSP),%RDX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD %ZMM4,(%RDX,%RSI,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD %R8,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R11,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP 0x38(%RSP),%R12 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x1(%R12),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JE 43358b <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1efb> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
TEST %R9,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JE 4357c0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x4130> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x30(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RDX,%R12,1),%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDX,%YMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA (%RBX,%R12,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA 0x1(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA -0x2(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %ESI,%YMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4355b3 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3f23> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %R9,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R9,0x120(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
JE 435520 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3e90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 4357c2 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x4132> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPBROADCASTQ %RSI,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVDQA64 0x400(%RSP),%ZMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VPSUBQ %ZMM25,%ZMM4,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPCMPNLEUQ 0xd3f1f(%RIP),%ZMM5,%K1 | |||||||||||||||
KORTESTB %K1,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JE 435520 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3e90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x30(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RDX,%R12,1),%R15 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R15,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R11,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RDX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0x50(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R10,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%RDI,%RSI,8),%ZMM4{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM4,%ZMM22{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
LEA -0x2(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA 0x1(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFPCLASSPD $0x50,%ZMM22,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVDQA64 %YMM23,%YMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPBROADCASTD %EDI,%YMM4{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RBX,%R12,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
VPBROADCASTD %EDI,%YMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %R15D,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM5{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVDQA64 %YMM26,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPBROADCASTD %R15D,%YMM6{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM5,%ZMM1,%ZMM17 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPXORD %XMM24,%XMM24,%XMM24 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
VPADDQ 0x2c0(%RSP),%ZMM25,%ZMM18 | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.67 |
VPSUBQ %ZMM2,%ZMM18,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSLLQ $0x3,%ZMM18,%ZMM18 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPADDQ %ZMM18,%ZMM8,%ZMM19 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM17,%ZMM19,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VPXORD %XMM19,%XMM19,%XMM19 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
VGATHERQPD (,%ZMM17,1),%ZMM19{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VANDPD %ZMM7,%ZMM22,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM18,%ZMM9,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM5,%ZMM18,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VXORPD %XMM21,%XMM21,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (,%ZMM5,1),%ZMM21{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMOVSXDQ %YMM4,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM4,%ZMM3,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM4,%ZMM18,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VXORPD %XMM5,%XMM5,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (,%ZMM4,1),%ZMM5{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVAPD 0x340(%RSP),%ZMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM19,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM4,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VDIVPD %ZMM4,%ZMM17,%ZMM25 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VMOVAPD %ZMM21,%ZMM20{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD 0x280(%RSP),%ZMM17 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM5,%ZMM17{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPMOVSXDQ %YMM6,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM4,%ZMM3,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM4,%ZMM18,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (,%ZMM4,1),%ZMM6{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VSUBPD %ZMM17,%ZMM20,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD 0x3c0(%RSP),%ZMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM6,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM4,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM20,%ZMM4,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM5,%ZMM27,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM4,%ZMM24,%K3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K3,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VMOVAPD %ZMM6,0x3c0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VMOVAPD %ZMM17,0x280(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VMOVAPD %ZMM19,0x340(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
JNE 435440 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3db0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x168(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4354f5 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3e65> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 143 |
nb uops | 182 |
loop length | 839 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 7 |
used zmm registers | 22 |
nb stack references | 12 |
ADD-SUB / MUL ratio | 0.80 |
micro-operation queue | 30.33 cycles |
front end | 30.33 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 46.00 | 11.00 | 18.67 | 18.67 | 2.50 | 46.00 | 9.00 | 2.50 | 2.50 | 2.50 | 9.00 | 18.67 |
cycles | 46.00 | 11.00 | 18.67 | 18.67 | 2.50 | 46.00 | 9.00 | 2.50 | 2.50 | 2.50 | 9.00 | 18.67 |
Cycles executing div or sqrt instructions | 36.00 |
FE+BE cycles | 47.03-89.14 |
Stall cycles | 18.90-60.96 |
ROB full (events) | 21.13-64.30 |
RS full (events) | 0.14-0.00 |
Front-end | 30.33 |
Dispatch | 46.00 |
DIV/SQRT | 36.00 |
Overall L1 | 46.00 |
all | 65% |
load | 60% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 86% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 46% |
all | 91% |
load | 90% |
store | 100% |
mul | 80% |
add-sub | 100% |
fma | 100% |
div/sqrt | 66% |
other | 93% |
all | 79% |
load | 81% |
store | 100% |
mul | 88% |
add-sub | 89% |
fma | 100% |
div/sqrt | 66% |
other | 71% |
all | 55% |
load | 65% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 88% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 28% |
all | 88% |
load | 92% |
store | 100% |
mul | 82% |
add-sub | 100% |
fma | 100% |
div/sqrt | 70% |
other | 86% |
all | 73% |
load | 83% |
store | 100% |
mul | 90% |
add-sub | 90% |
fma | 100% |
div/sqrt | 70% |
other | 59% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVDQA32 %YMM26,%YMM23{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
MOV 0x168(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R15,%RDX,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VANDPD %ZMM7,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM27,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSUBPD %ZMM25,%ZMM12,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM6,%ZMM24,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVSD %XMM4,%XMM13,%XMM17 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
VBROADCASTSD %XMM17,%ZMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x2,%ZMM24,%ZMM5,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM5,%ZMM24{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFMADD213PD %ZMM5,%ZMM25,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPMOVSXDQ %YMM23,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM18,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VXORPD %XMM19,%XMM19,%XMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KMOVQ %K3,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VGATHERQPD (%R15,%ZMM18,8),%ZMM19{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVAPD 0x300(%RSP),%ZMM18 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM19,%ZMM18{%K3} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM18,0x300(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VDIVPD %ZMM18,%ZMM5,%ZMM5 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VFMADD231PD %ZMM17,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VBROADCASTSD %XMM4,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPD %ZMM5,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x2,%ZMM24,%ZMM4,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM4,%ZMM24{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFPCLASSPD $0x56,%ZMM27,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VXORPD %ZMM15,%ZMM24,%ZMM24{%K2} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VMOVAPD %ZMM24,%ZMM4{%K3}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM25,%ZMM10,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM20,%ZMM4,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM22,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
IMUL %R8,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0xd8(%RSP),%RDX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD %ZMM4,(%RDX,%RSI,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD %R8,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R11,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP 0x38(%RSP),%R12 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x1(%R12),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JE 43358b <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1efb> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
TEST %R9,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JE 4357c0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x4130> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x30(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RDX,%R12,1),%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDX,%YMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA (%RBX,%R12,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA 0x1(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA -0x2(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %ESI,%YMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4355b3 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3f23> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %R9,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R9,0x120(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
JE 435520 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3e90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 4357c2 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x4132> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPBROADCASTQ %RSI,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVDQA64 0x400(%RSP),%ZMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VPSUBQ %ZMM25,%ZMM4,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPCMPNLEUQ 0xd3f1f(%RIP),%ZMM5,%K1 | |||||||||||||||
KORTESTB %K1,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JE 435520 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3e90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x30(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RDX,%R12,1),%R15 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R15,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R11,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RDX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0x50(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R10,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%RDI,%RSI,8),%ZMM4{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM4,%ZMM22{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
LEA -0x2(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA 0x1(%RBX,%R12,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFPCLASSPD $0x50,%ZMM22,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVDQA64 %YMM23,%YMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPBROADCASTD %EDI,%YMM4{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RBX,%R12,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
VPBROADCASTD %EDI,%YMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %R15D,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EDI,%YMM5{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVDQA64 %YMM26,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPBROADCASTD %R15D,%YMM6{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM5,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM5,%ZMM1,%ZMM17 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPXORD %XMM24,%XMM24,%XMM24 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
VPADDQ 0x2c0(%RSP),%ZMM25,%ZMM18 | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.67 |
VPSUBQ %ZMM2,%ZMM18,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSLLQ $0x3,%ZMM18,%ZMM18 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
VPADDQ %ZMM18,%ZMM8,%ZMM19 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM17,%ZMM19,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VPXORD %XMM19,%XMM19,%XMM19 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
VGATHERQPD (,%ZMM17,1),%ZMM19{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VANDPD %ZMM7,%ZMM22,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPMULLQ %ZMM5,%ZMM3,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM18,%ZMM9,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM5,%ZMM18,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VXORPD %XMM21,%XMM21,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (,%ZMM5,1),%ZMM21{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMOVSXDQ %YMM4,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM4,%ZMM3,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM4,%ZMM18,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VXORPD %XMM5,%XMM5,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (,%ZMM4,1),%ZMM5{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VMOVAPD 0x340(%RSP),%ZMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM19,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM4,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VDIVPD %ZMM4,%ZMM17,%ZMM25 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VMOVAPD %ZMM21,%ZMM20{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD 0x280(%RSP),%ZMM17 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM5,%ZMM17{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VPMOVSXDQ %YMM6,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM0,%ZMM4,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM4,%ZMM3,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM4,%ZMM18,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
KMOVQ %K1,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (,%ZMM4,1),%ZMM6{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VSUBPD %ZMM17,%ZMM20,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD 0x3c0(%RSP),%ZMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM6,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM4,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VSUBPD %ZMM20,%ZMM4,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM5,%ZMM27,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM4,%ZMM24,%K3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K3,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VMOVAPD %ZMM6,0x3c0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VMOVAPD %ZMM17,0x280(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VMOVAPD %ZMM19,0x340(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
JNE 435440 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3db0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x168(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4354f5 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x3e65> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |