Loop Id: 693 | Module: libqmcwfs.so | Source: MultiBsplineRef.hpp:227-262 [...] | Coverage: 0.01% |
---|
Loop Id: 693 | Module: libqmcwfs.so | Source: MultiBsplineRef.hpp:227-262 [...] | Coverage: 0.01% |
---|
0x6caa2 MOV -0x178(%RBP),%RAX |
0x6caa9 MOV %RSI,%R8 |
0x6caac SUB %RAX,%R8 |
0x6caaf MOV -0x170(%RBP),%RAX |
0x6cab6 MOV %R8,-0x160(%RBP) |
0x6cabd MOV -0x190(%RBP),%R8 |
0x6cac4 VMOVSD (%R8,%RAX,1),%XMM0 |
0x6caca MOV -0x198(%RBP),%R8 |
0x6cad1 VMOVSD (%R8,%RAX,1),%XMM5 |
0x6cad7 VMULSD %XMM0,%XMM16,%XMM19 |
0x6cadd MOV -0x188(%RBP),%R8 |
0x6cae4 VMULSD %XMM23,%XMM0,%XMM9 |
0x6caea VMULSD %XMM5,%XMM23,%XMM20 |
0x6caf0 VMULSD %XMM2,%XMM0,%XMM4 |
0x6caf4 VMULSD %XMM5,%XMM2,%XMM5 |
0x6caf8 VMULSD (%R8,%RAX,1),%XMM2,%XMM21 |
0x6caff XOR %EAX,%EAX |
0x6cb01 NOPL (%RAX) |
(692) 0x6cb08 MOV -0x160(%RBP),%R8 |
(692) 0x6cb0f VMOVSD (%RDI,%RAX,8),%XMM1 |
(692) 0x6cb14 VMOVSD (%R8,%RAX,8),%XMM8 |
(692) 0x6cb1a MOV -0x158(%RBP),%R8 |
(692) 0x6cb21 VMOVHPD (%R8,%RAX,8),%XMM1,%XMM0 |
(692) 0x6cb27 VMOVHPD (%RSI,%RAX,8),%XMM8,%XMM8 |
(692) 0x6cb2c MOV -0x168(%RBP),%R8 |
(692) 0x6cb33 VMULPD %XMM0,%XMM12,%XMM1 |
(692) 0x6cb37 VMULPD %XMM0,%XMM10,%XMM18 |
(692) 0x6cb3d VMULPD %XMM0,%XMM11,%XMM17 |
(692) 0x6cb43 VFMADD231PD %XMM13,%XMM8,%XMM1 |
(692) 0x6cb48 VFMADD231PD %XMM14,%XMM8,%XMM17 |
(692) 0x6cb4e VFMADD132PD %XMM15,%XMM18,%XMM8 |
(692) 0x6cb54 VUNPCKHPD %XMM1,%XMM1,%XMM0 |
(692) 0x6cb58 VADDPD %XMM1,%XMM0,%XMM0 |
(692) 0x6cb5c VUNPCKHPD %XMM17,%XMM17,%XMM1 |
(692) 0x6cb62 VUNPCKHPD %XMM8,%XMM8,%XMM22 |
(692) 0x6cb68 VADDPD %XMM17,%XMM1,%XMM1 |
(692) 0x6cb6e VADDPD %XMM8,%XMM22,%XMM8 |
(692) 0x6cb74 VMOVSD %XMM0,%XMM0,%XMM24 |
(692) 0x6cb7a VMOVSD %XMM0,%XMM0,%XMM25 |
(692) 0x6cb80 VMOVSD %XMM0,%XMM0,%XMM27 |
(692) 0x6cb86 VFMADD213SD (%R11,%RAX,8),%XMM19,%XMM24 |
(692) 0x6cb8d VMOVSD %XMM1,%XMM1,%XMM26 |
(692) 0x6cb93 VMOVSD %XMM1,%XMM1,%XMM28 |
(692) 0x6cb99 VMOVSD %XMM24,(%R11,%RAX,8) |
(692) 0x6cba0 VFMADD213SD (%R14,%RAX,8),%XMM20,%XMM25 |
(692) 0x6cba7 VMOVSD %XMM25,(%R14,%RAX,8) |
(692) 0x6cbae VFMADD213SD (%R13,%RAX,8),%XMM9,%XMM26 |
(692) 0x6cbb6 VMOVSD %XMM26,(%R13,%RAX,8) |
(692) 0x6cbbe VFMADD213SD (%R15,%RAX,8),%XMM21,%XMM27 |
(692) 0x6cbc5 VMOVSD %XMM27,(%R15,%RAX,8) |
(692) 0x6cbcc VFMADD213SD (%R12,%RAX,8),%XMM5,%XMM28 |
(692) 0x6cbd3 VMOVSD %XMM28,(%R12,%RAX,8) |
(692) 0x6cbda VFMADD213SD (%RBX,%RAX,8),%XMM4,%XMM8 |
(692) 0x6cbe0 VMOVSD %XMM8,(%RBX,%RAX,8) |
(692) 0x6cbe5 VMOVSD %XMM0,%XMM0,%XMM8 |
(692) 0x6cbe9 VFMADD213SD (%R9,%RAX,8),%XMM9,%XMM8 |
(692) 0x6cbef VMOVSD %XMM8,(%R9,%RAX,8) |
(692) 0x6cbf5 VMOVSD %XMM0,%XMM0,%XMM8 |
(692) 0x6cbf9 VFMADD213SD (%RDX,%RAX,8),%XMM4,%XMM0 |
(692) 0x6cbff VFMADD213SD (%RCX,%RAX,8),%XMM5,%XMM8 |
(692) 0x6cc05 VMOVSD %XMM0,(%RDX,%RAX,8) |
(692) 0x6cc0a VMOVSD %XMM8,(%RCX,%RAX,8) |
(692) 0x6cc0f VFMADD213SD (%R10,%RAX,8),%XMM4,%XMM1 |
(692) 0x6cc15 VMOVSD %XMM1,(%R10,%RAX,8) |
(692) 0x6cc1b INC %RAX |
(692) 0x6cc1e CMP %R8,%RAX |
(692) 0x6cc21 JNE 6cb08 |
0x6cc27 ADDQ $0x8,-0x170(%RBP) |
0x6cc2f MOV -0x180(%RBP),%RAX |
0x6cc36 ADD %RAX,-0x158(%RBP) |
0x6cc3d ADD %RAX,%RSI |
0x6cc40 ADD %RAX,%RDI |
0x6cc43 MOV -0x170(%RBP),%R8 |
0x6cc4a CMP $0x20,%R8 |
0x6cc4e JNE 6caa2 |
/home/eoseret/qaas_runs_CPU_9468/171-143-7755/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 227 - 262 |
-------------------------------------------------------------------------------- |
227: for (int j = 0; j < 4; j++) |
228: { |
229: const T* restrict coefs = spline_m->coefs + (ix + i) * xs + (iy + j) * ys + iz * zs; |
230: const T* restrict coefszs = coefs + zs; |
231: const T* restrict coefs2zs = coefs + 2 * zs; |
232: const T* restrict coefs3zs = coefs + 3 * zs; |
233: |
234: const T pre20 = d2a[i] * b[j]; |
235: const T pre10 = da[i] * b[j]; |
236: const T pre00 = a[i] * b[j]; |
237: const T pre11 = da[i] * db[j]; |
238: const T pre01 = a[i] * db[j]; |
239: const T pre02 = a[i] * d2b[j]; |
240: |
241: const int iSplitPoint = num_splines; |
242: for (int n = 0; n < iSplitPoint; n++) |
[...] |
249: T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs; |
250: T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs; |
251: T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs; |
252: |
253: hxx[n] += pre20 * sum0; |
254: hxy[n] += pre11 * sum0; |
255: hxz[n] += pre10 * sum1; |
256: hyy[n] += pre02 * sum0; |
257: hyz[n] += pre01 * sum1; |
258: hzz[n] += pre00 * sum2; |
259: gx[n] += pre10 * sum0; |
260: gy[n] += pre01 * sum0; |
261: gz[n] += pre00 * sum1; |
262: vals[n] += pre00 * sum0; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.56 |
CQA speedup if FP arith vectorized | 2.38 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.17 |
Bottlenecks | micro-operation queue, |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 4.67 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 1.96 |
CQA cycles if fully vectorized | 0.58 |
Front-end cycles | 4.67 |
P0 cycles | 3.00 |
P1 cycles | 3.00 |
P2 cycles | 4.00 |
P3 cycles | 4.00 |
P4 cycles | 1.50 |
P5 cycles | 2.00 |
P6 cycles | 2.00 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 1.50 |
P10 cycles | 2.00 |
P11 cycles | 4.00 |
DIV/SQRT cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 5.18 |
Stall cycles (UFS) | 0.00 |
Nb insns | 26.00 |
Nb uops | 27.00 |
Nb loads | 12.00 |
Nb stores | 3.00 |
Nb stack references | 8.00 |
FLOP/cycle | 1.29 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 25.71 |
Bytes prefetched | 0.00 |
Bytes loaded | 96.00 |
Bytes stored | 24.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 14.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.56 |
CQA speedup if FP arith vectorized | 2.38 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.17 |
Bottlenecks | micro-operation queue, |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 4.67 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 1.96 |
CQA cycles if fully vectorized | 0.58 |
Front-end cycles | 4.67 |
P0 cycles | 3.00 |
P1 cycles | 3.00 |
P2 cycles | 4.00 |
P3 cycles | 4.00 |
P4 cycles | 1.50 |
P5 cycles | 2.00 |
P6 cycles | 2.00 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 1.50 |
P10 cycles | 2.00 |
P11 cycles | 4.00 |
DIV/SQRT cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 5.18 |
Stall cycles (UFS) | 0.00 |
Nb insns | 26.00 |
Nb uops | 27.00 |
Nb loads | 12.00 |
Nb stores | 3.00 |
Nb stack references | 8.00 |
FLOP/cycle | 1.29 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 25.71 |
Bytes prefetched | 0.00 |
Bytes loaded | 96.00 |
Bytes stored | 24.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 14.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Path / |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | libqmcwfs.so |
nb instructions | 26 |
nb uops | 27 |
loop length | 147 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 10 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 8 |
micro-operation queue | 4.67 cycles |
front end | 4.67 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 4.00 | 4.00 | 1.50 | 2.00 | 2.00 | 1.50 | 1.50 | 1.50 | 2.00 | 4.00 |
cycles | 3.00 | 3.00 | 4.00 | 4.00 | 1.50 | 2.00 | 2.00 | 1.50 | 1.50 | 1.50 | 2.00 | 4.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 5.18 |
Stall cycles | 0.00 |
Front-end | 4.67 |
Dispatch | 4.00 |
Data deps. | 1.00 |
Overall L1 | 4.67 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x178(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RSI,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x170(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R8,-0x160(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x190(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R8,%RAX,1),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x198(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R8,%RAX,1),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULSD %XMM0,%XMM16,%XMM19 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x188(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULSD %XMM23,%XMM0,%XMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM5,%XMM23,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM2,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM5,%XMM2,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD (%R8,%RAX,1),%XMM2,%XMM21 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADDQ $0x8,-0x170(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
MOV -0x180(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,-0x158(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
ADD %RAX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x170(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP $0x20,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 6caa2 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x492> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | libqmcwfs.so |
nb instructions | 26 |
nb uops | 27 |
loop length | 147 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 10 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 8 |
micro-operation queue | 4.67 cycles |
front end | 4.67 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 4.00 | 4.00 | 1.50 | 2.00 | 2.00 | 1.50 | 1.50 | 1.50 | 2.00 | 4.00 |
cycles | 3.00 | 3.00 | 4.00 | 4.00 | 1.50 | 2.00 | 2.00 | 1.50 | 1.50 | 1.50 | 2.00 | 4.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 5.18 |
Stall cycles | 0.00 |
Front-end | 4.67 |
Dispatch | 4.00 |
Data deps. | 1.00 |
Overall L1 | 4.67 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x178(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RSI,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x170(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R8,-0x160(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x190(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R8,%RAX,1),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x198(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R8,%RAX,1),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULSD %XMM0,%XMM16,%XMM19 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x188(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULSD %XMM23,%XMM0,%XMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM5,%XMM23,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM2,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM5,%XMM2,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD (%R8,%RAX,1),%XMM2,%XMM21 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADDQ $0x8,-0x170(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
MOV -0x180(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,-0x158(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
ADD %RAX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x170(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP $0x20,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 6caa2 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x492> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |