Function: qmcplusplus::TinyVector<double, 3u> qmcplusplus::CrystalLattice<double, 3u>::toUnit_floor< ... | Module: libqmcwfs.so | Source: TinyVectorTensorOps.h:150-152 [...] | Coverage: 0.01% |
---|
Function: qmcplusplus::TinyVector<double, 3u> qmcplusplus::CrystalLattice<double, 3u>::toUnit_floor< ... | Module: libqmcwfs.so | Source: TinyVectorTensorOps.h:150-152 [...] | Coverage: 0.01% |
---|
/scratch_na/users/xoserete/qaas_runs/171-417-3180/intel/miniqmc/build/miniqmc/src/Particle/Lattice/CrystalLattice.h: 165 - 175 |
-------------------------------------------------------------------------------- |
165: inline SingleParticlePos toUnit_floor(const TinyVector<T1, D>& r) const |
166: { |
167: SingleParticlePos val_dot; |
168: val_dot = toUnit(r); |
169: for (int i = 0; i < D; i++) |
170: if (-std::numeric_limits<T1>::epsilon() < val_dot[i] && val_dot[i] < 0) |
171: val_dot[i] = T1(0.0); |
172: else |
173: val_dot[i] -= std::floor(val_dot[i]); |
174: return val_dot; |
175: } |
/scratch_na/users/xoserete/qaas_runs/171-417-3180/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorTensorOps.h: 150 - 152 |
-------------------------------------------------------------------------------- |
150: return TinyVector<Type_t, 3>(lhs[0] * rhs[0] + lhs[1] * rhs[3] + lhs[2] * rhs[6], |
151: lhs[0] * rhs[1] + lhs[1] * rhs[4] + lhs[2] * rhs[7], |
152: lhs[0] * rhs[2] + lhs[1] * rhs[5] + lhs[2] * rhs[8]); |
0x13e40 VMOVUPD (%RDX),%XMM0 |
0x13e44 VMOVSD 0x10(%RDX),%XMM6 |
0x13e49 VXORPD %XMM9,%XMM9,%XMM9 |
0x13e4e MOV %RDI,%RAX |
0x13e51 VMOVDDUP 0x69947(%RIP),%XMM11 |
0x13e59 VPERMILPD $0x3,%XMM0,%XMM3 |
0x13e5f VPERMILPD $0,%XMM0,%XMM2 |
0x13e65 VUNPCKHPD %XMM0,%XMM0,%XMM1 |
0x13e69 VMULPD 0xa8(%RSI),%XMM3,%XMM8 |
0x13e71 VMOVDDUP %XMM6,%XMM7 |
0x13e75 VMOVSD %XMM0,%XMM0,%XMM4 |
0x13e79 VFMADD132PD 0x90(%RSI),%XMM8,%XMM2 |
0x13e82 VFMADD132PD 0xc0(%RSI),%XMM2,%XMM7 |
0x13e8b VXORPD %XMM2,%XMM2,%XMM2 |
0x13e8f VMULSD 0xb8(%RSI),%XMM1,%XMM5 |
0x13e97 VFMADD132SD 0xa0(%RSI),%XMM5,%XMM4 |
0x13ea0 VFMADD132SD 0xd0(%RSI),%XMM4,%XMM6 |
0x13ea9 VCMPPD $0x1,%XMM9,%XMM7,%XMM10 |
0x13eaf VCMPPD $0x1,%XMM7,%XMM11,%XMM12 |
0x13eb4 VROUNDPD $0x9,%XMM7,%XMM14 |
0x13eba VSUBPD %XMM14,%XMM7,%XMM15 |
0x13ebf VCOMISD %XMM6,%XMM2 |
0x13ec3 VPAND %XMM10,%XMM12,%XMM13 |
0x13ec8 VBLENDVPD %XMM13,%XMM9,%XMM15,%XMM0 |
0x13ece VMOVUPD %XMM0,(%RDI) |
0x13ed2 JBE 13ede |
0x13ed4 VCOMISD 0x698c4(%RIP),%XMM6 |
0x13edc JA 13ef0 |
0x13ede VROUNDSD $0x9,%XMM6,%XMM6,%XMM3 |
0x13ee4 VSUBSD %XMM3,%XMM6,%XMM4 |
0x13ee8 VMOVSD %XMM4,0x10(%RAX) |
0x13eed RET |
0x13eee XCHG %AX,%AX |
0x13ef0 VMOVSD %XMM2,%XMM2,%XMM4 |
0x13ef4 VMOVSD %XMM4,0x10(%RAX) |
0x13ef9 RET |
0x13efa NOPW (%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►42.18+ | qmcplusplus::SPOSet::evaluateD[...] | OhmmsVector.h:249 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:217 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::eva[...] | WaveFunction.cpp:266 | libqmcwfs.so |
○ | main._omp_fn.1 | NonLocalPP.hpp:126 | exec |
○ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
►31.25+ | qmcplusplus::SPOSet::evaluateD[...] | OhmmsVector.h:249 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:217 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::eva[...] | stl_vector.h:987 | libqmcwfs.so |
○ | main._omp_fn.1 | NonLocalPP.hpp:126 | exec |
○ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
►10.94+ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:100 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:198 | libqmcwfs.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
►6.77+ | qmcplusplus::SPOSet::evaluate_[...] | OhmmsVector.h:210 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:263 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:238 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::eva[...] | WaveFunction.cpp:171 | libqmcwfs.so |
○ | main._omp_fn.0 | miniqmc.cpp:397 | exec |
○ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
►4.69+ | qmcplusplus::SPOSet::evaluate_[...] | OhmmsVector.h:210 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:263 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:238 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::eva[...] | WaveFunction.cpp:171 | libqmcwfs.so |
○ | main._omp_fn.0 | miniqmc.cpp:397 | exec |
○ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
►2.08+ | qmcplusplus::SPOSet::evaluateD[...] | OhmmsVector.h:249 | libqmcwfs.so |
○ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:217 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::eva[...] | stl_vector.h:987 | libqmcwfs.so |
○ | main._omp_fn.1 | NonLocalPP.hpp:126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.04+ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:100 | libqmcwfs.so |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:198 | libqmcwfs.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Source file and lines | TinyVectorTensorOps.h:150-152 |
Module | libqmcwfs.so |
nb instructions | 31 |
nb uops | 35.33 |
loop length | 168.67 |
used x86 registers | 4 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 0.83 |
micro-operation queue | 5.89 cycles |
front end | 5.89 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 8.06 | 7.94 | 3.56 | 3.56 | 1.00 | 8.00 | 2.67 | 1.00 | 1.00 | 1.00 | 0.00 | 3.56 |
cycles | 8.06 | 7.94 | 3.56 | 3.56 | 1.00 | 8.00 | 2.67 | 1.00 | 1.00 | 1.00 | 0.00 | 3.56 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 8.55-8.54 |
Stall cycles | 2.02-2.02 |
RS full (events) | 6.37-6.35 |
Front-end | 5.89 |
Dispatch | 8.06 |
Overall L1 | 8.06 |
all | 50% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 50% |
all | 53% |
load | 41% |
store | 50% |
mul | 50% |
add-sub | 66% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 58% |
all | 52% |
load | 41% |
store | 50% |
mul | 50% |
add-sub | 66% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 57% |
all | 18% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
all | 19% |
load | 17% |
store | 18% |
mul | 18% |
add-sub | 20% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
all | 19% |
load | 17% |
store | 18% |
mul | 18% |
add-sub | 20% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
Source file and lines | TinyVectorTensorOps.h:150-152 |
Module | libqmcwfs.so |
nb instructions | 30 |
nb uops | 34 |
loop length | 164 |
used x86 registers | 4 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 5.67 cycles |
front end | 5.67 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 8.17 | 7.83 | 3.33 | 3.33 | 1.00 | 8.00 | 2.00 | 1.00 | 1.00 | 1.00 | 0.00 | 3.33 |
cycles | 8.17 | 7.83 | 3.33 | 3.33 | 1.00 | 8.00 | 2.00 | 1.00 | 1.00 | 1.00 | 0.00 | 3.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 8.62-8.58 |
Stall cycles | 2.32-2.31 |
RS full (events) | 7.06-7.04 |
Front-end | 5.67 |
Dispatch | 8.17 |
Overall L1 | 8.17 |
all | 50% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 50% |
all | 53% |
load | 44% |
store | 50% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 61% |
all | 53% |
load | 44% |
store | 50% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 18% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
all | 19% |
load | 18% |
store | 18% |
mul | 18% |
add-sub | 18% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
all | 19% |
load | 18% |
store | 18% |
mul | 18% |
add-sub | 18% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD (%RDX),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVSD 0x10(%RDX),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDDUP 0x69947(%RIP),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPERMILPD $0x3,%XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPERMILPD $0,%XMM0,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VUNPCKHPD %XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMULPD 0xa8(%RSI),%XMM3,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVDDUP %XMM6,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD %XMM0,%XMM0,%XMM4 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VFMADD132PD 0x90(%RSI),%XMM8,%XMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132PD 0xc0(%RSI),%XMM2,%XMM7 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD 0xb8(%RSI),%XMM1,%XMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132SD 0xa0(%RSI),%XMM5,%XMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132SD 0xd0(%RSI),%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VCMPPD $0x1,%XMM9,%XMM7,%XMM10 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%XMM7,%XMM11,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VROUNDPD $0x9,%XMM7,%XMM14 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBPD %XMM14,%XMM7,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VCOMISD %XMM6,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPAND %XMM10,%XMM12,%XMM13 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VBLENDVPD %XMM13,%XMM9,%XMM15,%XMM0 | 3 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2-3 | 1 |
VMOVUPD %XMM0,(%RDI) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 0.50 |
JBE 13ede <_ZNK11qmcplusplus14CrystalLatticeIdLj3EE12toUnit_floorIdEENS_10TinyVectorIdLj3EEERKNS3_IT_Lj3EEE+0x9e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VROUNDSD $0x9,%XMM6,%XMM6,%XMM3 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM3,%XMM6,%XMM4 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD %XMM4,0x10(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
Source file and lines | TinyVectorTensorOps.h:150-152 |
Module | libqmcwfs.so |
nb instructions | 32 |
nb uops | 37 |
loop length | 174 |
used x86 registers | 4 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 6.17 cycles |
front end | 6.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 8.33 | 8.33 | 3.67 | 3.67 | 1.00 | 8.33 | 3.00 | 1.00 | 1.00 | 1.00 | 0.00 | 3.67 |
cycles | 8.33 | 8.33 | 3.67 | 3.67 | 1.00 | 8.33 | 3.00 | 1.00 | 1.00 | 1.00 | 0.00 | 3.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 8.88-8.84 |
Stall cycles | 2.07-2.03 |
RS full (events) | 6.46-6.42 |
Front-end | 6.17 |
Dispatch | 8.33 |
Overall L1 | 8.33 |
all | 50% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 50% |
all | 51% |
load | 40% |
store | 50% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 57% |
all | 51% |
load | 40% |
store | 50% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 56% |
all | 18% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
all | 18% |
load | 17% |
store | 18% |
mul | 18% |
add-sub | 18% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
all | 18% |
load | 17% |
store | 18% |
mul | 18% |
add-sub | 18% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD (%RDX),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVSD 0x10(%RDX),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDDUP 0x69947(%RIP),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPERMILPD $0x3,%XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPERMILPD $0,%XMM0,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VUNPCKHPD %XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMULPD 0xa8(%RSI),%XMM3,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVDDUP %XMM6,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD %XMM0,%XMM0,%XMM4 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VFMADD132PD 0x90(%RSI),%XMM8,%XMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132PD 0xc0(%RSI),%XMM2,%XMM7 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD 0xb8(%RSI),%XMM1,%XMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132SD 0xa0(%RSI),%XMM5,%XMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132SD 0xd0(%RSI),%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VCMPPD $0x1,%XMM9,%XMM7,%XMM10 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%XMM7,%XMM11,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VROUNDPD $0x9,%XMM7,%XMM14 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBPD %XMM14,%XMM7,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VCOMISD %XMM6,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPAND %XMM10,%XMM12,%XMM13 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VBLENDVPD %XMM13,%XMM9,%XMM15,%XMM0 | 3 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2-3 | 1 |
VMOVUPD %XMM0,(%RDI) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 0.50 |
JBE 13ede <_ZNK11qmcplusplus14CrystalLatticeIdLj3EE12toUnit_floorIdEENS_10TinyVectorIdLj3EEERKNS3_IT_Lj3EEE+0x9e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VCOMISD 0x698c4(%RIP),%XMM6 | 2 | 1 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 |
JA 13ef0 <_ZNK11qmcplusplus14CrystalLatticeIdLj3EE12toUnit_floorIdEENS_10TinyVectorIdLj3EEERKNS3_IT_Lj3EEE+0xb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VROUNDSD $0x9,%XMM6,%XMM6,%XMM3 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM3,%XMM6,%XMM4 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD %XMM4,0x10(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
Source file and lines | TinyVectorTensorOps.h:150-152 |
Module | libqmcwfs.so |
nb instructions | 31 |
nb uops | 35 |
loop length | 168 |
used x86 registers | 4 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 0.50 |
micro-operation queue | 5.83 cycles |
front end | 5.83 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 7.67 | 7.67 | 3.67 | 3.67 | 1.00 | 7.67 | 3.00 | 1.00 | 1.00 | 1.00 | 0.00 | 3.67 |
cycles | 7.67 | 7.67 | 3.67 | 3.67 | 1.00 | 7.67 | 3.00 | 1.00 | 1.00 | 1.00 | 0.00 | 3.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 8.16-8.21 |
Stall cycles | 1.69-1.71 |
RS full (events) | 5.59-5.60 |
Front-end | 5.83 |
Dispatch | 7.67 |
Overall L1 | 7.67 |
all | 50% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 50% |
all | 53% |
load | 40% |
store | 50% |
mul | 50% |
add-sub | 100% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 57% |
all | 53% |
load | 40% |
store | 50% |
mul | 50% |
add-sub | 100% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 56% |
all | 18% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
all | 19% |
load | 17% |
store | 18% |
mul | 18% |
add-sub | 25% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
all | 19% |
load | 17% |
store | 18% |
mul | 18% |
add-sub | 25% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD (%RDX),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVSD 0x10(%RDX),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDDUP 0x69947(%RIP),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPERMILPD $0x3,%XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPERMILPD $0,%XMM0,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VUNPCKHPD %XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMULPD 0xa8(%RSI),%XMM3,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVDDUP %XMM6,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD %XMM0,%XMM0,%XMM4 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VFMADD132PD 0x90(%RSI),%XMM8,%XMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132PD 0xc0(%RSI),%XMM2,%XMM7 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD 0xb8(%RSI),%XMM1,%XMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132SD 0xa0(%RSI),%XMM5,%XMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD132SD 0xd0(%RSI),%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VCMPPD $0x1,%XMM9,%XMM7,%XMM10 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%XMM7,%XMM11,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VROUNDPD $0x9,%XMM7,%XMM14 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBPD %XMM14,%XMM7,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VCOMISD %XMM6,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPAND %XMM10,%XMM12,%XMM13 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VBLENDVPD %XMM13,%XMM9,%XMM15,%XMM0 | 3 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2-3 | 1 |
VMOVUPD %XMM0,(%RDI) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 0.50 |
JBE 13ede <_ZNK11qmcplusplus14CrystalLatticeIdLj3EE12toUnit_floorIdEENS_10TinyVectorIdLj3EEERKNS3_IT_Lj3EEE+0x9e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VCOMISD 0x698c4(%RIP),%XMM6 | 2 | 1 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 |
JA 13ef0 <_ZNK11qmcplusplus14CrystalLatticeIdLj3EE12toUnit_floorIdEENS_10TinyVectorIdLj3EEERKNS3_IT_Lj3EEE+0xb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVSD %XMM2,%XMM2,%XMM4 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVSD %XMM4,0x10(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
Name | Coverage (%) | Time (s) |
---|---|---|
○qmcplusplus::TinyVector | 0.01 | 0.01 |