Loop Id: 343 | Module: exec | Source: flux_calc_kernel.f90:54-60 | Coverage: 0.01% |
---|
Loop Id: 343 | Module: exec | Source: flux_calc_kernel.f90:54-60 | Coverage: 0.01% |
---|
0x441d00 VPBROADCASTQ %RSI,%ZMM13 |
0x441d06 SUB -0x78(%RBP),%RAX |
0x441d0a IMUL %RAX,%R15 |
0x441d0e MOV %R15,-0x30(%RBP) |
0x441d12 VBROADCASTSD %XMM12,%ZMM12 |
0x441d18 LEA 0x1(%RAX),%RCX |
0x441d1c MOV %RAX,%RSI |
0x441d1f MOV %RDI,%RAX |
0x441d22 IMUL %RCX,%RAX |
0x441d26 IMUL %RSI,%RDI |
0x441d2a MOV %RDI,-0x58(%RBP) |
0x441d2e MOV -0x60(%RBP),%R15 |
0x441d32 MOV %R8,%RDI |
0x441d35 MOV %R15,%R8 |
0x441d38 IMUL %RSI,%R8 |
0x441d3c IMUL %RCX,%R15 |
0x441d40 IMUL %RSI,%RDI |
0x441d44 MOV %RDI,%RCX |
0x441d47 IMUL %RSI,%RDX |
0x441d4b IMUL %RSI,%R13 |
0x441d4f IMUL %RSI,%R12 |
0x441d53 IMUL %RSI,%R9 |
0x441d57 MOV %R9,-0x48(%RBP) |
0x441d5b XOR %R11D,%R11D |
0x441d5e MOV %RDX,%RDI |
0x441d61 MOV %RCX,%RDX |
0x441d64 MOV %R15,%R9 |
0x441d67 MOV -0x30(%RBP),%RCX |
0x441d6b MOV -0x58(%RBP),%RSI |
0x441d6f VPBROADCASTQ %R11,%ZMM14 |
0x441d75 ADD %R11,%R14 |
0x441d78 VPSUBQ %ZMM14,%ZMM13,%ZMM13 |
0x441d7e VPCMPNLEUQ %ZMM1,%ZMM13,%K1 |
0x441d85 ADD 0x38(%RBP),%RCX |
0x441d89 SUB -0xe8(%RBP),%R14 |
0x441d90 VMOVUPD (%RCX,%R14,8),%ZMM13{%K1}{z} |
0x441d97 MOV 0x28(%RBP),%RCX |
0x441d9b ADD %RCX,%RAX |
0x441d9e VMOVUPD (%RAX,%R14,8),%ZMM14{%K1}{z} |
0x441da5 ADD %RCX,%RSI |
0x441da8 VMOVUPD (%RSI,%R14,8),%ZMM15{%K1}{z} |
0x441daf MOV 0x18(%RBP),%RAX |
0x441db3 ADD %RAX,%R8 |
0x441db6 VMOVUPD (%R8,%R14,8),%ZMM16{%K1}{z} |
0x441dbd ADD %RAX,%R9 |
0x441dc0 VMOVUPD (%R9,%R14,8),%ZMM17{%K1}{z} |
0x441dc7 VMOVAPD %ZMM13,%ZMM11{%K1} |
0x441dcd VMULPD %ZMM12,%ZMM11,%ZMM13 |
0x441dd3 VMOVAPD %ZMM14,%ZMM10{%K1} |
0x441dd9 VMOVAPD %ZMM15,%ZMM9{%K1} |
0x441ddf VADDPD %ZMM9,%ZMM10,%ZMM14 |
0x441de5 VMOVAPD %ZMM16,%ZMM8{%K1} |
0x441deb VMOVAPD %ZMM17,%ZMM7{%K1} |
0x441df1 VADDPD %ZMM7,%ZMM8,%ZMM15 |
0x441df7 VADDPD %ZMM15,%ZMM14,%ZMM14 |
0x441dfd VMULPD %ZMM14,%ZMM13,%ZMM13 |
0x441e03 ADD -0x88(%RBP),%RDX |
0x441e0a VMOVUPD %ZMM13,(%RDX,%R14,8){%K1} |
0x441e11 ADD 0x30(%RBP),%RDI |
0x441e15 VMOVUPD (%RDI,%R14,8),%ZMM13{%K1}{z} |
0x441e1c ADD 0x20(%RBP),%R13 |
0x441e20 VMOVUPD 0x8(%R13,%R14,8),%ZMM14{%K1}{z} |
0x441e2b VMOVUPD (%R13,%R14,8),%ZMM15{%K1}{z} |
0x441e33 ADD 0x10(%RBP),%R12 |
0x441e37 VMOVUPD (%R12,%R14,8),%ZMM16{%K1}{z} |
0x441e3e VMOVUPD 0x8(%R12,%R14,8),%ZMM17{%K1}{z} |
0x441e49 VMOVAPD %ZMM13,%ZMM6{%K1} |
0x441e4f VMOVAPD %ZMM14,%ZMM5{%K1} |
0x441e55 VMULPD %ZMM12,%ZMM6,%ZMM12 |
0x441e5b VMOVAPD %ZMM15,%ZMM4{%K1} |
0x441e61 VADDPD %ZMM4,%ZMM5,%ZMM13 |
0x441e67 VMOVAPD %ZMM16,%ZMM3{%K1} |
0x441e6d VMOVAPD %ZMM17,%ZMM2{%K1} |
0x441e73 VADDPD %ZMM2,%ZMM3,%ZMM14 |
0x441e79 VADDPD %ZMM14,%ZMM13,%ZMM13 |
0x441e7f VMULPD %ZMM13,%ZMM12,%ZMM12 |
0x441e85 MOV -0x48(%RBP),%RAX |
0x441e89 ADD -0x80(%RBP),%RAX |
0x441e8d VMOVUPD %ZMM12,(%RAX,%R14,8){%K1} |
0x441e94 MOV -0x34(%RBP),%R8D |
0x441e98 MOV 0x60(%RBP),%RDX |
0x441e9c MOV 0x58(%RBP),%R9 |
0x441ea0 LEA 0x1(%RBX),%EAX |
0x441ea3 INC %R10D |
0x441ea6 CMP %R8D,%EBX |
0x441ea9 MOV %EAX,%EBX |
0x441eab JE 441b0d |
0x441eb1 MOVSXD (%RDX),%R14 |
0x441eb4 MOV (%R9),%ESI |
0x441eb7 MOV %ESI,%EAX |
0x441eb9 SUB %R14D,%EAX |
0x441ebc INC %EAX |
0x441ebe JS 441ea0 |
0x441ec0 MOV -0x110(%RBP),%RAX |
0x441ec7 ADD %EBX,%EAX |
0x441ec9 VMULSD 0x312f5f(%RIP),%XMM0,%XMM12 |
0x441ed1 MOV 0x68(%RBP),%RCX |
0x441ed5 MOV (%RCX),%R15 |
0x441ed8 MOV 0x70(%RBP),%RCX |
0x441edc MOV (%RCX),%RDI |
0x441edf MOV 0x78(%RBP),%RCX |
0x441ee3 MOV (%RCX),%RCX |
0x441ee6 MOV %RCX,-0x60(%RBP) |
0x441eea MOV 0x80(%RBP),%RCX |
0x441ef1 MOV (%RCX),%R8 |
0x441ef4 MOV 0x88(%RBP),%RCX |
0x441efb MOV (%RCX),%RDX |
0x441efe MOV 0x90(%RBP),%RCX |
0x441f05 MOV (%RCX),%R13 |
0x441f08 MOV 0x98(%RBP),%RCX |
0x441f0f MOV (%RCX),%R12 |
0x441f12 MOV 0xa0(%RBP),%RCX |
0x441f19 MOV (%RCX),%R9 |
0x441f1c SUB %R14D,%ESI |
0x441f1f ADD $0x2,%ESI |
0x441f22 CMP $0x2,%ESI |
0x441f25 MOV $0x1,%ECX |
0x441f2a CMOVL %ECX,%ESI |
0x441f2d MOV %RSI,%R11 |
0x441f30 CLTQ |
0x441f32 AND $0x7ffffff8,%R11 |
0x441f39 JE 441d00 |
0x441f3f MOV %RSI,-0x118(%RBP) |
0x441f46 MOV %RAX,-0xf0(%RBP) |
0x441f4d MOV %RBX,-0x120(%RBP) |
0x441f54 MOV %R10D,-0x68(%RBP) |
0x441f58 MOV %R9,%RBX |
0x441f5b MOVSXD %R10D,%R9 |
0x441f5e MOV -0xd8(%RBP),%RAX |
0x441f65 LEA (%RAX,%R9,1),%R10 |
0x441f69 ADD -0xe0(%RBP),%R9 |
0x441f70 MOV %R12,-0x108(%RBP) |
0x441f77 MOV %R12,%RAX |
0x441f7a IMUL %R10,%RAX |
0x441f7e LEA (%RAX,%R14,8),%RAX |
0x441f82 ADD -0x98(%RBP),%RAX |
0x441f89 MOV %R13,-0x100(%RBP) |
0x441f90 MOV %R13,%RCX |
0x441f93 IMUL %R10,%RCX |
0x441f97 MOV %R15,-0x30(%RBP) |
0x441f9b MOV %R14,%RSI |
0x441f9e MOV %RDI,%R14 |
0x441fa1 LEA (%RCX,%RSI,8),%RDI |
0x441fa5 ADD -0xd0(%RBP),%RDI |
0x441fac MOV %RBX,-0x48(%RBP) |
0x441fb0 MOV %RBX,%RCX |
0x441fb3 IMUL %R10,%RCX |
0x441fb7 LEA (%RCX,%RSI,8),%R12 |
0x441fbb ADD -0xc8(%RBP),%R12 |
0x441fc2 MOV %RDX,-0xf8(%RBP) |
0x441fc9 MOV %RDX,%RCX |
0x441fcc IMUL %R10,%RCX |
0x441fd0 MOV %R8,%RDX |
0x441fd3 LEA (%RCX,%RSI,8),%R8 |
0x441fd7 ADD -0xc0(%RBP),%R8 |
0x441fde MOV %RDX,-0x50(%RBP) |
0x441fe2 MOV %RDX,%RCX |
0x441fe5 IMUL %R10,%RCX |
0x441fe9 LEA (%RCX,%RSI,8),%RBX |
0x441fed ADD -0xb8(%RBP),%RBX |
0x441ff4 MOV -0x60(%RBP),%R13 |
0x441ff8 MOV %R13,%RCX |
0x441ffb IMUL %R9,%RCX |
0x441fff LEA (%RCX,%RSI,8),%RDX |
0x442003 MOV -0xb0(%RBP),%R15 |
0x44200a ADD %R15,%RDX |
0x44200d MOV %R13,%RCX |
0x442010 IMUL %R10,%RCX |
0x442014 LEA (%RCX,%RSI,8),%R13 |
0x442018 ADD %R15,%R13 |
0x44201b MOV %R14,%RCX |
0x44201e IMUL %R10,%RCX |
0x442022 LEA (%RCX,%RSI,8),%R15 |
0x442026 MOV -0xa8(%RBP),%RCX |
0x44202d ADD %RCX,%R15 |
0x442030 MOV %R14,-0x58(%RBP) |
0x442034 IMUL %R14,%R9 |
0x442038 MOV %RSI,%R14 |
0x44203b LEA (%R9,%RSI,8),%R9 |
0x44203f ADD %RCX,%R9 |
0x442042 IMUL -0x30(%RBP),%R10 |
0x442047 LEA (%R10,%RSI,8),%R10 |
0x44204b ADD -0xa0(%RBP),%R10 |
0x442052 VBROADCASTSD %XMM12,%ZMM12 |
0x442058 XOR %ECX,%ECX |
0x44205a NOPW (%RAX,%RAX,1) |
(344) 0x442060 VMULPD (%R10,%RCX,8),%ZMM12,%ZMM13 |
(344) 0x442067 VMOVUPD (%R9,%RCX,8),%ZMM14 |
(344) 0x44206e VADDPD (%R15,%RCX,8),%ZMM14,%ZMM14 |
(344) 0x442075 VADDPD (%R13,%RCX,8),%ZMM14,%ZMM14 |
(344) 0x44207d VADDPD (%RDX,%RCX,8),%ZMM14,%ZMM14 |
(344) 0x442084 VMULPD %ZMM14,%ZMM13,%ZMM13 |
(344) 0x44208a VMOVUPD %ZMM13,(%RBX,%RCX,8) |
(344) 0x442091 VMULPD (%R8,%RCX,8),%ZMM12,%ZMM13 |
(344) 0x442098 VMOVUPD (%RDI,%RCX,8),%ZMM14 |
(344) 0x44209f VADDPD -0x8(%RDI,%RCX,8),%ZMM14,%ZMM14 |
(344) 0x4420aa VADDPD -0x8(%RAX,%RCX,8),%ZMM14,%ZMM14 |
(344) 0x4420b5 VADDPD (%RAX,%RCX,8),%ZMM14,%ZMM14 |
(344) 0x4420bc VMULPD %ZMM14,%ZMM13,%ZMM13 |
(344) 0x4420c2 VMOVUPD %ZMM13,(%R12,%RCX,8) |
(344) 0x4420c9 ADD $0x8,%RCX |
(344) 0x4420cd CMP %R11,%RCX |
(344) 0x4420d0 JB 442060 |
0x4420d2 MOV -0x118(%RBP),%RSI |
0x4420d9 CMP %RSI,%R11 |
0x4420dc MOV -0x34(%RBP),%R8D |
0x4420e0 MOV 0x60(%RBP),%RDX |
0x4420e4 MOV 0x58(%RBP),%R9 |
0x4420e8 MOV -0x68(%RBP),%R10D |
0x4420ec MOV -0x120(%RBP),%RBX |
0x4420f3 JE 441ea0 |
0x4420f9 MOV -0xf0(%RBP),%RDI |
0x442100 SUB -0x78(%RBP),%RDI |
0x442104 LEA 0x1(%RDI),%RCX |
0x442108 MOV -0x58(%RBP),%RDX |
0x44210c MOV %RDX,%RAX |
0x44210f IMUL %RCX,%RAX |
0x442113 MOV -0x60(%RBP),%R9 |
0x442117 MOV %R9,%R8 |
0x44211a IMUL %RCX,%R9 |
0x44211e MOV -0x30(%RBP),%RCX |
0x442122 IMUL %RDI,%RCX |
0x442126 MOV %RCX,-0x30(%RBP) |
0x44212a IMUL %RDI,%RDX |
0x44212e IMUL %RDI,%R8 |
0x442132 MOV -0x50(%RBP),%RCX |
0x442136 IMUL %RDI,%RCX |
0x44213a MOV %RCX,-0x50(%RBP) |
0x44213e MOV -0xf8(%RBP),%R15 |
0x442145 IMUL %RDI,%R15 |
0x442149 MOV -0x100(%RBP),%R13 |
0x442150 IMUL %RDI,%R13 |
0x442154 MOV -0x108(%RBP),%R12 |
0x44215b IMUL %RDI,%R12 |
0x44215f MOV -0x48(%RBP),%RCX |
0x442163 IMUL %RDI,%RCX |
0x442167 MOV %RCX,-0x48(%RBP) |
0x44216b VPBROADCASTQ %RSI,%ZMM13 |
0x442171 MOV %RDX,%RSI |
0x442174 MOV -0x50(%RBP),%RDX |
0x442178 MOV %R15,%RDI |
0x44217b MOV -0x30(%RBP),%RCX |
0x44217f JMP 441d6f |
/scratch_na/users/xoserete/qaas_runs/171-214-9740/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/flux_calc_kernel.f90: 54 - 60 |
-------------------------------------------------------------------------------- |
54: DO k=y_min,y_max+1 |
55: !$OMP SIMD |
56: DO j=x_min,x_max+1 |
57: vol_flux_x(j,k)=0.25_8*dt*xarea(j,k) & |
58: *(xvel0(j,k)+xvel0(j,k+1)+xvel1(j,k)+xvel1(j,k+1)) |
59: vol_flux_y(j,k)=0.25_8*dt*yarea(j,k) & |
60: *(yvel0(j,k)+yvel0(j+1,k)+yvel1(j,k)+yvel1(j+1,k)) |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.69 |
CQA speedup if FP arith vectorized | 1.20 |
CQA speedup if fully vectorized | 1.53 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 |
Bottlenecks | |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source | flux_calc_kernel.f90:54-60 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 18.29 |
CQA cycles if no scalar integer | 4.96 |
CQA cycles if FP arith vectorized | 15.25 |
CQA cycles if fully vectorized | 11.92 |
Front-end cycles | 18.29 |
DIV/SQRT cycles | 8.65 |
P0 cycles | 14.99 |
P1 cycles | 13.33 |
P2 cycles | 13.33 |
P3 cycles | 4.38 |
P4 cycles | 8.65 |
P5 cycles | 8.60 |
P6 cycles | 4.38 |
P7 cycles | 4.38 |
P8 cycles | 4.38 |
P9 cycles | 8.60 |
P10 cycles | 13.33 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 19.75 - 19.77 |
Stall cycles (UFS) | 1.30 - 1.29 |
Nb insns | 110.50 |
Nb uops | 109.75 |
Nb loads | 40.00 |
Nb stores | 8.75 |
Nb stack references | 26.00 |
FLOP/cycle | 2.23 |
Nb FLOP add-sub | 24.00 |
Nb FLOP mul | 16.75 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 30.20 |
Bytes prefetched | 0.00 |
Bytes loaded | 586.00 |
Bytes stored | 124.00 |
Stride 0 | 1.50 |
Stride 1 | 0.00 |
Stride n | 0.50 |
Stride unknown | 11.75 |
Stride indirect | 0.00 |
Vectorization ratio all | 29.63 |
Vectorization ratio load | 37.09 |
Vectorization ratio store | 15.03 |
Vectorization ratio mul | 41.27 |
Vectorization ratio add_sub | 43.75 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 28.22 |
Vector-efficiency ratio all | 36.88 |
Vector-efficiency ratio load | 44.04 |
Vector-efficiency ratio store | 25.36 |
Vector-efficiency ratio mul | 48.61 |
Vector-efficiency ratio add_sub | 49.02 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 34.53 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 14.67 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.31 |
Bottlenecks | micro-operation queue, |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source | flux_calc_kernel.f90:54-60 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 1.83 |
CQA cycles if no scalar integer | 1.83 |
CQA cycles if FP arith vectorized | 1.83 |
CQA cycles if fully vectorized | 0.13 |
Front-end cycles | 1.83 |
DIV/SQRT cycles | 1.40 |
P0 cycles | 1.40 |
P1 cycles | 0.67 |
P2 cycles | 0.67 |
P3 cycles | 0.00 |
P4 cycles | 1.40 |
P5 cycles | 1.40 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 1.40 |
P10 cycles | 0.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 1.94 - 2.07 |
Stall cycles (UFS) | 0.00 |
Nb insns | 11.00 |
Nb uops | 11.00 |
Nb loads | 2.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 4.36 |
Bytes prefetched | 0.00 |
Bytes loaded | 8.00 |
Bytes stored | 0.00 |
Stride 0 | 0.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 2.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 7.81 |
Vector-efficiency ratio load | 9.38 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 6.25 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 6.25 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.61 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.13 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.28 |
Bottlenecks | micro-operation queue, |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source | flux_calc_kernel.f90:54-60 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 20.00 |
CQA cycles if no scalar integer | 7.67 |
CQA cycles if FP arith vectorized | 19.93 |
CQA cycles if fully vectorized | 17.68 |
Front-end cycles | 20.00 |
DIV/SQRT cycles | 10.60 |
P0 cycles | 15.10 |
P1 cycles | 15.67 |
P2 cycles | 15.67 |
P3 cycles | 3.00 |
P4 cycles | 10.60 |
P5 cycles | 10.60 |
P6 cycles | 3.00 |
P7 cycles | 3.00 |
P8 cycles | 3.00 |
P9 cycles | 10.60 |
P10 cycles | 15.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 23.05 - 23.01 |
Stall cycles (UFS) | 2.85 - 2.81 |
Nb insns | 122.00 |
Nb uops | 120.00 |
Nb loads | 47.00 |
Nb stores | 6.00 |
Nb stack references | 26.00 |
FLOP/cycle | 4.05 |
Nb FLOP add-sub | 48.00 |
Nb FLOP mul | 33.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 54.20 |
Bytes prefetched | 0.00 |
Bytes loaded | 924.00 |
Bytes stored | 160.00 |
Stride 0 | 2.00 |
Stride 1 | 0.00 |
Stride n | 1.00 |
Stride unknown | 11.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 65.38 |
Vectorization ratio load | 76.92 |
Vectorization ratio store | 33.33 |
Vectorization ratio mul | 57.14 |
Vectorization ratio add_sub | 87.50 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 57.89 |
Vector-efficiency ratio all | 69.35 |
Vector-efficiency ratio load | 79.81 |
Vector-efficiency ratio store | 41.67 |
Vector-efficiency ratio mul | 62.50 |
Vector-efficiency ratio add_sub | 88.28 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 62.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.92 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.15 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.17 |
Bottlenecks | micro-operation queue, |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source | flux_calc_kernel.f90:54-60 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 32.67 |
CQA cycles if no scalar integer | 8.33 |
CQA cycles if FP arith vectorized | 32.59 |
CQA cycles if fully vectorized | 28.43 |
Front-end cycles | 32.67 |
DIV/SQRT cycles | 14.60 |
P0 cycles | 28.03 |
P1 cycles | 24.33 |
P2 cycles | 24.33 |
P3 cycles | 8.50 |
P4 cycles | 14.60 |
P5 cycles | 14.40 |
P6 cycles | 8.50 |
P7 cycles | 8.50 |
P8 cycles | 8.50 |
P9 cycles | 14.40 |
P10 cycles | 24.33 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 35.20 |
Stall cycles (UFS) | 2.36 |
Nb insns | 197.00 |
Nb uops | 196.00 |
Nb loads | 73.00 |
Nb stores | 17.00 |
Nb stack references | 44.00 |
FLOP/cycle | 2.48 |
Nb FLOP add-sub | 48.00 |
Nb FLOP mul | 33.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 41.88 |
Bytes prefetched | 0.00 |
Bytes loaded | 1124.00 |
Bytes stored | 244.00 |
Stride 0 | 2.00 |
Stride 1 | 0.00 |
Stride n | 1.00 |
Stride unknown | 21.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 53.13 |
Vectorization ratio load | 71.43 |
Vectorization ratio store | 11.76 |
Vectorization ratio mul | 66.67 |
Vectorization ratio add_sub | 87.50 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 55.00 |
Vector-efficiency ratio all | 58.69 |
Vector-efficiency ratio load | 75.00 |
Vector-efficiency ratio store | 22.43 |
Vector-efficiency ratio mul | 70.83 |
Vector-efficiency ratio add_sub | 89.06 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 60.00 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 9.33 |
CQA speedup if FP arith vectorized | 2.81 |
CQA speedup if fully vectorized | 12.99 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 |
Bottlenecks | micro-operation queue, |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source | flux_calc_kernel.f90:54-60 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 18.67 |
CQA cycles if no scalar integer | 2.00 |
CQA cycles if FP arith vectorized | 6.64 |
CQA cycles if fully vectorized | 1.44 |
Front-end cycles | 18.67 |
DIV/SQRT cycles | 8.00 |
P0 cycles | 15.43 |
P1 cycles | 12.67 |
P2 cycles | 12.67 |
P3 cycles | 6.00 |
P4 cycles | 8.00 |
P5 cycles | 8.00 |
P6 cycles | 6.00 |
P7 cycles | 6.00 |
P8 cycles | 6.00 |
P9 cycles | 8.00 |
P10 cycles | 12.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 18.80 - 18.81 |
Stall cycles (UFS) | 0.00 |
Nb insns | 112.00 |
Nb uops | 112.00 |
Nb loads | 38.00 |
Nb stores | 12.00 |
Nb stack references | 34.00 |
FLOP/cycle | 0.05 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 1.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 20.36 |
Bytes prefetched | 0.00 |
Bytes loaded | 288.00 |
Bytes stored | 92.00 |
Stride 0 | 2.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 13.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.67 |
Vector-efficiency ratio load | 11.98 |
Vector-efficiency ratio store | 11.98 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 9.38 |
Path / |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | flux_calc_kernel.f90:54-60 |
Module | exec |
nb instructions | 110.50 |
nb uops | 109.75 |
loop length | 506.25 |
used x86 registers | 13.25 |
used mmx registers | 0 |
used xmm registers | 1.50 |
used ymm registers | 0 |
used zmm registers | 8.75 |
nb stack references | 26 |
micro-operation queue | 18.29 cycles |
front end | 18.29 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 8.65 | 10.50 | 13.33 | 13.33 | 4.38 | 8.65 | 8.60 | 4.38 | 4.38 | 4.38 | 8.60 | 13.33 |
cycles | 8.65 | 14.99 | 13.33 | 13.33 | 4.38 | 8.65 | 8.60 | 4.38 | 4.38 | 4.38 | 8.60 | 13.33 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 19.75-19.77 |
Stall cycles | 1.30-1.29 |
LM full (events) | 1.73-1.71 |
Front-end | 18.29 |
Dispatch | 15.13 |
Data deps. | 0.00 |
Overall L1 | 18.29 |
all | 4% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 25% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 5% |
all | 62% |
load | 60% |
store | 100% |
mul | 53% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 29% |
load | 37% |
store | 15% |
mul | 41% |
add-sub | 43% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 28% |
all | 14% |
load | 11% |
store | 12% |
mul | 12% |
add-sub | 32% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 14% |
all | 67% |
load | 65% |
store | 100% |
mul | 59% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 65% |
all | 36% |
load | 44% |
store | 25% |
mul | 48% |
add-sub | 49% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 34% |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | flux_calc_kernel.f90:54-60 |
Module | exec |
nb instructions | 11 |
nb uops | 11 |
loop length | 32 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 1.83 cycles |
front end | 1.83 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.40 | 1.40 | 0.67 | 0.67 | 0.00 | 1.40 | 1.40 | 0.00 | 0.00 | 0.00 | 1.40 | 0.67 |
cycles | 1.40 | 1.40 | 0.67 | 0.67 | 0.00 | 1.40 | 1.40 | 0.00 | 0.00 | 0.00 | 1.40 | 0.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 1.94-2.07 |
Stall cycles | 0.00 |
Front-end | 1.83 |
Dispatch | 1.40 |
Data deps. | 0.00 |
Overall L1 | 1.83 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 6% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 6% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LEA 0x1(%RBX),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
INC %R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R8D,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
JE 441b0d <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x8d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOVSXD (%RDX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R9),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %ESI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JS 441ea0 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x420> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | flux_calc_kernel.f90:54-60 |
Module | exec |
nb instructions | 122 |
nb uops | 120 |
loop length | 575 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 17 |
nb stack references | 26 |
ADD-SUB / MUL ratio | 1.20 |
micro-operation queue | 20.00 cycles |
front end | 20.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 10.60 | 10.60 | 15.67 | 15.67 | 3.00 | 10.60 | 10.60 | 3.00 | 3.00 | 3.00 | 10.60 | 15.67 |
cycles | 10.60 | 15.10 | 15.67 | 15.67 | 3.00 | 10.60 | 10.60 | 3.00 | 3.00 | 3.00 | 10.60 | 15.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 23.05-23.01 |
Stall cycles | 2.85-2.81 |
LM full (events) | 3.86-3.82 |
Front-end | 20.00 |
Dispatch | 15.67 |
Data deps. | 0.00 |
Overall L1 | 20.00 |
all | 11% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 94% |
load | 90% |
store | 100% |
mul | 80% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 90% |
all | 65% |
load | 76% |
store | 33% |
mul | 57% |
add-sub | 87% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 57% |
all | 21% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 53% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 21% |
all | 94% |
load | 92% |
store | 100% |
mul | 82% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 92% |
all | 69% |
load | 79% |
store | 41% |
mul | 62% |
add-sub | 88% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 62% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VPBROADCASTQ %RSI,%ZMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB -0x78(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
IMUL %RAX,%R15 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R15,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD %XMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA 0x1(%RAX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RCX,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RSI,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RDI,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x60(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R8,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R15,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RSI,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RCX,%R15 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RSI,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RDI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RSI,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RSI,%R13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RSI,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RSI,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R9,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %R11D,%R11D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RDX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R15,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x58(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPBROADCASTQ %R11,%ZMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R11,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VPSUBQ %ZMM14,%ZMM13,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPCMPNLEUQ %ZMM1,%ZMM13,%K1 | |||||||||||||||
ADD 0x38(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
SUB -0xe8(%RBP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD (%RCX,%R14,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
MOV 0x28(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%RAX,%R14,8),%ZMM14{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%RSI,%R14,8),%ZMM15{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%R8,%R14,8),%ZMM16{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD %RAX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%R9,%R14,8),%ZMM17{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM13,%ZMM11{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMULPD %ZMM12,%ZMM11,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM14,%ZMM10{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM15,%ZMM9{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM9,%ZMM10,%ZMM14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %ZMM16,%ZMM8{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM17,%ZMM7{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM7,%ZMM8,%ZMM15 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDPD %ZMM15,%ZMM14,%ZMM14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM14,%ZMM13,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD -0x88(%RBP),%RDX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD %ZMM13,(%RDX,%R14,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD 0x30(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD (%RDI,%R14,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD 0x20(%RBP),%R13 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD 0x8(%R13,%R14,8),%ZMM14{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVUPD (%R13,%R14,8),%ZMM15{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD 0x10(%RBP),%R12 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD (%R12,%R14,8),%ZMM16{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVUPD 0x8(%R12,%R14,8),%ZMM17{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM13,%ZMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM14,%ZMM5{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMULPD %ZMM12,%ZMM6,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM15,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM4,%ZMM5,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %ZMM16,%ZMM3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM17,%ZMM2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM2,%ZMM3,%ZMM14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDPD %ZMM14,%ZMM13,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM13,%ZMM12,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD -0x80(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD %ZMM12,(%RAX,%R14,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
MOV -0x34(%RBP),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x60(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x58(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%RBX),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
INC %R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R8D,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
JE 441b0d <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x8d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOVSXD (%RDX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R9),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %ESI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JS 441ea0 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x420> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x110(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMULSD 0x312f5f(%RIP),%XMM0,%XMM12 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV 0x68(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x70(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x78(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x80(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x88(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x90(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x98(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xa0(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %R14D,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x2,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV $0x1,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMOVL %ECX,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV %RSI,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CLTQ | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
AND $0x7ffffff8,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 441d00 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x280> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | flux_calc_kernel.f90:54-60 |
Module | exec |
nb instructions | 197 |
nb uops | 196 |
loop length | 931 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 17 |
nb stack references | 44 |
ADD-SUB / MUL ratio | 1.20 |
micro-operation queue | 32.67 cycles |
front end | 32.67 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 14.60 | 20.00 | 24.33 | 24.33 | 8.50 | 14.60 | 14.40 | 8.50 | 8.50 | 8.50 | 14.40 | 24.33 |
cycles | 14.60 | 28.03 | 24.33 | 24.33 | 8.50 | 14.60 | 14.40 | 8.50 | 8.50 | 8.50 | 14.40 | 24.33 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 35.20 |
Stall cycles | 2.36 |
LM full (events) | 3.04 |
Front-end | 32.67 |
Dispatch | 28.03 |
Data deps. | 0.00 |
Overall L1 | 32.67 |
all | 6% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 94% |
load | 90% |
store | 100% |
mul | 80% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 90% |
all | 53% |
load | 71% |
store | 11% |
mul | 66% |
add-sub | 87% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 55% |
all | 17% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 56% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 20% |
all | 94% |
load | 92% |
store | 100% |
mul | 82% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 92% |
all | 58% |
load | 75% |
store | 22% |
mul | 70% |
add-sub | 89% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VPBROADCASTQ %R11,%ZMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R11,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VPSUBQ %ZMM14,%ZMM13,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPCMPNLEUQ %ZMM1,%ZMM13,%K1 | |||||||||||||||
ADD 0x38(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
SUB -0xe8(%RBP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD (%RCX,%R14,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
MOV 0x28(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%RAX,%R14,8),%ZMM14{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%RSI,%R14,8),%ZMM15{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%R8,%R14,8),%ZMM16{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD %RAX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVUPD (%R9,%R14,8),%ZMM17{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM13,%ZMM11{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMULPD %ZMM12,%ZMM11,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM14,%ZMM10{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM15,%ZMM9{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM9,%ZMM10,%ZMM14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %ZMM16,%ZMM8{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM17,%ZMM7{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM7,%ZMM8,%ZMM15 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDPD %ZMM15,%ZMM14,%ZMM14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM14,%ZMM13,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD -0x88(%RBP),%RDX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD %ZMM13,(%RDX,%R14,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD 0x30(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD (%RDI,%R14,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD 0x20(%RBP),%R13 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD 0x8(%R13,%R14,8),%ZMM14{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVUPD (%R13,%R14,8),%ZMM15{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
ADD 0x10(%RBP),%R12 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD (%R12,%R14,8),%ZMM16{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVUPD 0x8(%R12,%R14,8),%ZMM17{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVAPD %ZMM13,%ZMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM14,%ZMM5{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMULPD %ZMM12,%ZMM6,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM15,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM4,%ZMM5,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %ZMM16,%ZMM3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM17,%ZMM2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VADDPD %ZMM2,%ZMM3,%ZMM14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDPD %ZMM14,%ZMM13,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM13,%ZMM12,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD -0x80(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVUPD %ZMM12,(%RAX,%R14,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
MOV -0x34(%RBP),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x60(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x58(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%RBX),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
INC %R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R8D,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
JE 441b0d <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x8d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOVSXD (%RDX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R9),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %ESI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JS 441ea0 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x420> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x110(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMULSD 0x312f5f(%RIP),%XMM0,%XMM12 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV 0x68(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x70(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x78(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x80(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x88(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x90(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x98(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xa0(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %R14D,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x2,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV $0x1,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMOVL %ECX,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV %RSI,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CLTQ | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
AND $0x7ffffff8,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 441d00 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x280> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RSI,-0x118(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RAX,-0xf0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RBX,-0x120(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R10D,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOVSXD %R10D,%R9 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV -0xd8(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RAX,%R9,1),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xe0(%RBP),%R9 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %R12,-0x108(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R12,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RAX,%R14,8),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0x98(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %R13,-0x100(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R13,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R15,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R14,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RDI,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%RCX,%RSI,8),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xd0(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RBX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RBX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xc8(%RBP),%R12 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RDX,-0xf8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R8,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%RCX,%RSI,8),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xc0(%RBP),%R8 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RDX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xb8(%RBP),%RBX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV -0x60(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R13,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R9,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0xb0(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %R15,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R13,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R15,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R14,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0xa8(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RCX,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R14,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %R14,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RSI,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%R9,%RSI,8),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %RCX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
IMUL -0x30(%RBP),%R10 | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R10,%RSI,8),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xa0(%RBP),%R10 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VBROADCASTSD %XMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x118(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %RSI,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x34(%RBP),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x60(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x58(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x68(%RBP),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x120(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 441ea0 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x420> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0xf0(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB -0x78(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x1(%RDI),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x58(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RDX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RCX,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV -0x60(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R9,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %RCX,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV -0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %RDI,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RCX,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %RDI,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
IMUL %RDI,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV -0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %RDI,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RCX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0xf8(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %RDI,%R15 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV -0x100(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %RDI,%R13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV -0x108(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %RDI,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV -0x48(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %RDI,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RCX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %RSI,%ZMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x50(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 441d6f <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x2ef> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | flux_calc_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | flux_calc_kernel.f90:54-60 |
Module | exec |
nb instructions | 112 |
nb uops | 112 |
loop length | 487 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 1 |
nb stack references | 34 |
micro-operation queue | 18.67 cycles |
front end | 18.67 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 8.00 | 10.00 | 12.67 | 12.67 | 6.00 | 8.00 | 8.00 | 6.00 | 6.00 | 6.00 | 8.00 | 12.67 |
cycles | 8.00 | 15.43 | 12.67 | 12.67 | 6.00 | 8.00 | 8.00 | 6.00 | 6.00 | 6.00 | 8.00 | 12.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 18.80-18.81 |
Stall cycles | 0.00 |
Front-end | 18.67 |
Dispatch | 15.43 |
Data deps. | 0.00 |
Overall L1 | 18.67 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 11% |
store | 11% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 8% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 11% |
load | 11% |
store | 11% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 9% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LEA 0x1(%RBX),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
INC %R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R8D,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
JE 441b0d <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x8d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOVSXD (%RDX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R9),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %ESI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JS 441ea0 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x420> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x110(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMULSD 0x312f5f(%RIP),%XMM0,%XMM12 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV 0x68(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x70(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x78(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x80(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x88(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x90(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x98(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xa0(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %R14D,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x2,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV $0x1,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMOVL %ECX,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV %RSI,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CLTQ | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
AND $0x7ffffff8,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 441d00 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x280> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RSI,-0x118(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RAX,-0xf0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RBX,-0x120(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R10D,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOVSXD %R10D,%R9 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV -0xd8(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RAX,%R9,1),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xe0(%RBP),%R9 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %R12,-0x108(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R12,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RAX,%R14,8),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0x98(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %R13,-0x100(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R13,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R15,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R14,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RDI,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%RCX,%RSI,8),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xd0(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RBX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RBX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xc8(%RBP),%R12 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RDX,-0xf8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R8,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%RCX,%RSI,8),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xc0(%RBP),%R8 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RDX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xb8(%RBP),%RBX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV -0x60(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R13,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R9,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0xb0(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %R15,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R13,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R15,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R14,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
IMUL %R10,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RCX,%RSI,8),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0xa8(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RCX,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R14,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %R14,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RSI,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%R9,%RSI,8),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %RCX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
IMUL -0x30(%RBP),%R10 | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R10,%RSI,8),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD -0xa0(%RBP),%R10 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VBROADCASTSD %XMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x118(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %RSI,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x34(%RBP),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x60(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x58(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x68(%RBP),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x120(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 441ea0 <flux_calc_kernel_module_mp_flux_calc_kernel_.DIR.OMP.PARALLEL.2+0x420> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |