Loop Id: 438 | Module: libqmcparticle_omptarget.so | Source: ParticleBConds3DSoa.h:280-298 [...] | Coverage: 0.01% |
---|
Loop Id: 438 | Module: libqmcparticle_omptarget.so | Source: ParticleBConds3DSoa.h:280-298 [...] | Coverage: 0.01% |
---|
0x43e38 LEA 0x200(%R9),%ESI |
0x43e3f VMOVSD (%R13),%XMM6 |
0x43e45 VMOVSD (%R11),%XMM7 |
0x43e4a VMOVD %ESI,%XMM4 |
0x43e4e VPMINSD 0x40(%RSP),%XMM4,%XMM5 |
0x43e55 VMOVD %XMM5,%EDI |
0x43e59 VMOVSD (%R14),%XMM5 |
0x43e5e CMP %R9D,%EDI |
0x43e61 JLE 440b4 |
0x43e67 VMOVSD 0x110(%R12),%XMM10 |
0x43e71 MOV %R10,%RSI |
0x43e74 MOV %R9,%RAX |
0x43e77 VMOVSD 0x118(%R12),%XMM11 |
0x43e81 VMOVSD 0x120(%R12),%XMM12 |
0x43e8b MOV 0x50(%RSP),%RDX |
0x43e90 VMOVSD 0x128(%R12),%XMM13 |
0x43e9a VMOVSD 0x130(%R12),%XMM14 |
0x43ea4 VMOVSD %XMM10,0xb8(%RSP) |
0x43ead VMOVSD 0x138(%R12),%XMM15 |
0x43eb7 LEA (%RDX,%R9,1),%RDX |
0x43ebb VMOVSD 0xd8(%R12),%XMM10 |
0x43ec5 VMOVSD %XMM11,0xb0(%RSP) |
0x43ece VMOVSD 0x140(%R12),%XMM1 |
0x43ed8 SAL $0x3,%RDX |
0x43edc VMOVSD 0x148(%R12),%XMM2 |
0x43ee6 VMOVSD %XMM12,0xa8(%RSP) |
0x43eef VMOVSD 0x150(%R12),%XMM0 |
0x43ef9 VMOVSD %XMM10,0x58(%RSP) |
0x43eff VMOVSD 0xc8(%R12),%XMM3 |
0x43f09 VMOVSD 0xd0(%R12),%XMM4 |
0x43f13 VMOVSD %XMM2,0x78(%RSP) |
0x43f19 VMOVSD 0xf8(%R12),%XMM12 |
0x43f23 VMOVSD 0x100(%R12),%XMM11 |
0x43f2d VMOVSD %XMM0,0x70(%RSP) |
0x43f33 VMOVSD 0x108(%R12),%XMM10 |
0x43f3d VMOVSD %XMM13,0x90(%RSP) |
0x43f46 VMOVSD 0xf0(%R12),%XMM13 |
0x43f50 VMOVSD %XMM14,0xa0(%RSP) |
0x43f59 VMOVSD 0xe8(%R12),%XMM14 |
0x43f63 VMOVSD %XMM15,0x88(%RSP) |
0x43f6c VMOVSD 0xe0(%R12),%XMM15 |
0x43f76 VMOVSD %XMM1,0x80(%RSP) |
0x43f7f VMOVSD %XMM3,0x68(%RSP) |
0x43f85 VMOVSD %XMM4,0x60(%RSP) |
0x43f8b NOPL (%RAX,%RAX,1) |
(440) 0x43f90 VMOVSD (%RBX,%RDX,1),%XMM2 |
(440) 0x43f95 VMOVSD (%RBX,%RAX,8),%XMM1 |
(440) 0x43f9a VMOVSD (%RBX,%RSI,1),%XMM4 |
(440) 0x43f9f VSUBSD %XMM6,%XMM2,%XMM3 |
(440) 0x43fa3 VSUBSD %XMM5,%XMM1,%XMM0 |
(440) 0x43fa7 VMULSD 0xb0(%RSP),%XMM3,%XMM2 |
(440) 0x43fb0 VFMADD231SD 0xb8(%RSP),%XMM0,%XMM2 |
(440) 0x43fba VSUBSD %XMM7,%XMM4,%XMM4 |
(440) 0x43fbe VFMADD231SD 0xa8(%RSP),%XMM4,%XMM2 |
(440) 0x43fc8 VMULSD 0xa0(%RSP),%XMM3,%XMM1 |
(440) 0x43fd1 VFMADD231SD 0x90(%RSP),%XMM0,%XMM1 |
(440) 0x43fdb VMULSD 0x78(%RSP),%XMM3,%XMM3 |
(440) 0x43fe1 VFMADD231SD 0x88(%RSP),%XMM4,%XMM1 |
(440) 0x43feb VFMADD132SD 0x80(%RSP),%XMM3,%XMM0 |
(440) 0x43ff5 VFMADD231SD 0x70(%RSP),%XMM4,%XMM0 |
(440) 0x43ffc VANDPD %XMM2,%XMM9,%XMM4 |
(440) 0x44000 VORPD %XMM4,%XMM8,%XMM3 |
(440) 0x44004 VADDSD %XMM3,%XMM2,%XMM4 |
(440) 0x44008 VANDPD %XMM1,%XMM9,%XMM3 |
(440) 0x4400c VROUNDSD $0x3,%XMM4,%XMM4,%XMM4 |
(440) 0x44012 VSUBSD %XMM4,%XMM2,%XMM2 |
(440) 0x44016 VORPD %XMM3,%XMM8,%XMM4 |
(440) 0x4401a VADDSD %XMM4,%XMM1,%XMM3 |
(440) 0x4401e VANDPD %XMM0,%XMM9,%XMM4 |
(440) 0x44022 VROUNDSD $0x3,%XMM3,%XMM3,%XMM3 |
(440) 0x44028 VSUBSD %XMM3,%XMM1,%XMM1 |
(440) 0x4402c VORPD %XMM4,%XMM8,%XMM3 |
(440) 0x44030 VADDSD %XMM3,%XMM0,%XMM4 |
(440) 0x44034 VROUNDSD $0x3,%XMM4,%XMM4,%XMM4 |
(440) 0x4403a VSUBSD %XMM4,%XMM0,%XMM3 |
(440) 0x4403e VMULSD 0x60(%RSP),%XMM1,%XMM0 |
(440) 0x44044 VMULSD %XMM14,%XMM1,%XMM4 |
(440) 0x44049 VFMADD231SD 0x68(%RSP),%XMM2,%XMM0 |
(440) 0x44050 VMULSD %XMM11,%XMM1,%XMM1 |
(440) 0x44055 VFMADD231SD 0x58(%RSP),%XMM3,%XMM0 |
(440) 0x4405c VFMADD231SD %XMM15,%XMM2,%XMM4 |
(440) 0x44061 VFMADD132SD %XMM12,%XMM1,%XMM2 |
(440) 0x44066 VMOVSD %XMM0,(%RCX,%RAX,8) |
(440) 0x4406b VFMADD231SD %XMM13,%XMM3,%XMM4 |
(440) 0x44070 VFMADD231SD %XMM10,%XMM3,%XMM2 |
(440) 0x44075 VMOVSD %XMM4,(%RCX,%RDX,1) |
(440) 0x4407a VMOVSD %XMM2,(%RCX,%RSI,1) |
(440) 0x4407f VMOVSD (%RCX,%RDX,1),%XMM0 |
(440) 0x44084 VMOVSD (%RCX,%RAX,8),%XMM3 |
(440) 0x44089 ADD $0x8,%RDX |
(440) 0x4408d ADD $0x8,%RSI |
(440) 0x44091 VMULSD %XMM0,%XMM0,%XMM4 |
(440) 0x44095 VFMADD132SD %XMM3,%XMM4,%XMM3 |
(440) 0x4409a VFMADD132SD %XMM2,%XMM3,%XMM2 |
(440) 0x4409f VSQRTSD %XMM2,%XMM2,%XMM2 |
(440) 0x440a3 VMOVSD %XMM2,(%R8,%RAX,8) |
(440) 0x440a9 INC %RAX |
(440) 0x440ac CMP %EAX,%EDI |
(440) 0x440ae JG 43f90 |
0x440b4 ADD $0x200,%R9 |
0x440bb ADD $0x1000,%R10 |
0x440c2 CMP %R15,%R9 |
0x440c5 JNE 43e38 |
/scratch_na/users/xoserete/qaas_runs/171-417-3180/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds3DSoa.h: 280 - 298 |
-------------------------------------------------------------------------------- |
280: T displ_0 = px[iat] - x0; |
281: T displ_1 = py[iat] - y0; |
282: T displ_2 = pz[iat] - z0; |
283: |
284: T ar_0 = displ_0 * g00 + displ_1 * g10 + displ_2 * g20; |
285: T ar_1 = displ_0 * g01 + displ_1 * g11 + displ_2 * g21; |
286: T ar_2 = displ_0 * g02 + displ_1 * g12 + displ_2 * g22; |
287: |
288: //put them in the box |
289: ar_0 -= round(ar_0); |
290: ar_1 -= round(ar_1); |
291: ar_2 -= round(ar_2); |
292: |
293: //unit2cart |
294: dx[iat] = ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
295: dy[iat] = ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
296: dz[iat] = ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
297: |
298: temp_r[iat] = std::sqrt(dx[iat] * dx[iat] + dy[iat] * dy[iat] + dz[iat] * dz[iat]); |
/scratch_na/users/xoserete/qaas_runs/171-417-3180/intel/miniqmc/build/miniqmc/src/Platforms/OMPTarget/OMPTargetMath.hpp: 38 - 38 |
-------------------------------------------------------------------------------- |
38: return a < b ? a : b; |
/scratch_na/users/xoserete/qaas_runs/171-417-3180/intel/miniqmc/build/miniqmc/src/Particle/SoaDistanceTableABOMPTarget.h: 215 - 228 |
-------------------------------------------------------------------------------- |
215: for (int team_id = 0; team_id < num_teams; team_id++) |
216: { |
217: const int first = ChunkSizePerTeam * team_id; |
218: const int last = omptarget::min(first + ChunkSizePerTeam, num_sources_local); |
219: |
220: T pos[D]; |
221: for (int idim = 0; idim < D; idim++) |
222: pos[idim] = target_pos_ptr[iat * D + idim]; |
[...] |
228: for (int iel = first; iel < last; iel++) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►99.10+ | qmcplusplus::ParticleSet::upda[...] | stl_vector.h:987 | libqmcparticle.so |
○ | main._omp_fn.1 | NonLocalPP.hpp:135 | exec |
○ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.13 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.42 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.05 |
Bottlenecks | |
Function | qmcplusplus::SoaDistanceTableABOMPTarget |
Source | ParticleBConds3DSoa.h:284-286,ParticleBConds3DSoa.h:294-296,OMPTargetMath.hpp:38-38,SoaDistanceTableABOMPTarget.h:215-215,SoaDistanceTableABOMPTarget.h:218-218,SoaDistanceTableABOMPTarget.h:222-222,SoaDistanceTableABOMPTarget.h:228-228 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.00 |
CQA cycles if no scalar integer | 4.42 |
CQA cycles if FP arith vectorized | 5.00 |
CQA cycles if fully vectorized | 0.59 |
Front-end cycles | 5.00 |
DIV/SQRT cycles | 1.95 |
P0 cycles | 1.90 |
P1 cycles | 4.50 |
P2 cycles | 4.50 |
P3 cycles | 3.00 |
P4 cycles | 1.90 |
P5 cycles | 1.85 |
P6 cycles | 3.00 |
P7 cycles | 3.00 |
P8 cycles | 3.00 |
P9 cycles | 1.90 |
P10 cycles | 4.50 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 5.24 - 5.25 |
Stall cycles (UFS) | 0.00 |
Nb insns | 31.00 |
Nb uops | 30.00 |
Nb loads | 13.50 |
Nb stores | 6.00 |
Nb stack references | 7.50 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 28.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 116.00 |
Bytes stored | 48.00 |
Stride 0 | 4.50 |
Stride 1 | 1.50 |
Stride n | 2.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 7.53 |
Vectorization ratio load | 14.77 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 22.50 |
Vector-efficiency ratio all | 12.03 |
Vector-efficiency ratio load | 14.35 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 11.09 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.09 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.17 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.04 |
Bottlenecks | micro-operation queue, |
Function | qmcplusplus::SoaDistanceTableABOMPTarget |
Source | ParticleBConds3DSoa.h:284-286,ParticleBConds3DSoa.h:294-296,OMPTargetMath.hpp:38-38,SoaDistanceTableABOMPTarget.h:215-215,SoaDistanceTableABOMPTarget.h:218-218,SoaDistanceTableABOMPTarget.h:222-222,SoaDistanceTableABOMPTarget.h:228-228 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.00 |
CQA cycles if no scalar integer | 7.33 |
CQA cycles if FP arith vectorized | 8.00 |
CQA cycles if fully vectorized | 0.98 |
Front-end cycles | 8.00 |
DIV/SQRT cycles | 2.00 |
P0 cycles | 2.00 |
P1 cycles | 7.67 |
P2 cycles | 7.67 |
P3 cycles | 6.00 |
P4 cycles | 2.00 |
P5 cycles | 2.00 |
P6 cycles | 6.00 |
P7 cycles | 6.00 |
P8 cycles | 6.00 |
P9 cycles | 2.00 |
P10 cycles | 7.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 8.12 |
Stall cycles (UFS) | 0.00 |
Nb insns | 49.00 |
Nb uops | 48.00 |
Nb loads | 23.00 |
Nb stores | 12.00 |
Nb stack references | 14.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 36.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 192.00 |
Bytes stored | 96.00 |
Stride 0 | 5.00 |
Stride 1 | 3.00 |
Stride n | 4.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 2.56 |
Vectorization ratio load | 4.55 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 20.00 |
Vector-efficiency ratio all | 12.34 |
Vector-efficiency ratio load | 13.07 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 11.25 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.33 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 9.60 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.05 |
Bottlenecks | micro-operation queue, |
Function | qmcplusplus::SoaDistanceTableABOMPTarget |
Source | ParticleBConds3DSoa.h:284-286,ParticleBConds3DSoa.h:294-296,OMPTargetMath.hpp:38-38,SoaDistanceTableABOMPTarget.h:215-215,SoaDistanceTableABOMPTarget.h:218-218,SoaDistanceTableABOMPTarget.h:222-222,SoaDistanceTableABOMPTarget.h:228-228 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 2.00 |
CQA cycles if no scalar integer | 1.50 |
CQA cycles if FP arith vectorized | 2.00 |
CQA cycles if fully vectorized | 0.21 |
Front-end cycles | 2.00 |
DIV/SQRT cycles | 1.90 |
P0 cycles | 1.80 |
P1 cycles | 1.33 |
P2 cycles | 1.33 |
P3 cycles | 0.00 |
P4 cycles | 1.80 |
P5 cycles | 1.70 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 1.80 |
P10 cycles | 1.33 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 2.35 - 2.38 |
Stall cycles (UFS) | 0.00 |
Nb insns | 13.00 |
Nb uops | 12.00 |
Nb loads | 4.00 |
Nb stores | 0.00 |
Nb stack references | 1.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 20.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 40.00 |
Bytes stored | 0.00 |
Stride 0 | 4.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 12.50 |
Vectorization ratio load | 25.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 25.00 |
Vector-efficiency ratio all | 11.72 |
Vector-efficiency ratio load | 15.63 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 10.94 |
Path / |
Function | qmcplusplus::SoaDistanceTableABOMPTarget |
Source file and lines | ParticleBConds3DSoa.h:280-298 |
Module | libqmcparticle_omptarget.so |
nb instructions | 31 |
nb uops | 30 |
loop length | 218.50 |
used x86 registers | 10.50 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7.50 |
micro-operation queue | 5.00 cycles |
front end | 5.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.95 | 1.90 | 4.50 | 4.50 | 3.00 | 1.90 | 1.85 | 3.00 | 3.00 | 3.00 | 1.90 | 4.50 |
cycles | 1.95 | 1.90 | 4.50 | 4.50 | 3.00 | 1.90 | 1.85 | 3.00 | 3.00 | 3.00 | 1.90 | 4.50 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 5.23-5.25 |
Stall cycles | 0.00 |
Front-end | 5.00 |
Dispatch | 4.78 |
Data deps. | 1.00 |
Overall L1 | 5.00 |
all | 18% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 7% |
load | 14% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 22% |
all | 11% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 14% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Function | qmcplusplus::SoaDistanceTableABOMPTarget |
Source file and lines | ParticleBConds3DSoa.h:280-298 |
Module | libqmcparticle_omptarget.so |
nb instructions | 49 |
nb uops | 48 |
loop length | 367 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 14 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 14 |
micro-operation queue | 8.00 cycles |
front end | 8.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.00 | 2.00 | 7.67 | 7.67 | 6.00 | 2.00 | 2.00 | 6.00 | 6.00 | 6.00 | 2.00 | 7.67 |
cycles | 2.00 | 2.00 | 7.67 | 7.67 | 6.00 | 2.00 | 2.00 | 6.00 | 6.00 | 6.00 | 2.00 | 7.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 8.12 |
Stall cycles | 0.00 |
Front-end | 8.00 |
Dispatch | 7.67 |
Data deps. | 1.00 |
Overall L1 | 8.00 |
all | 16% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 20% |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 2% |
load | 4% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
all | 11% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 13% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LEA 0x200(%R9),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
VMOVSD (%R13),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R11),%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVD %ESI,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMINSD 0x40(%RSP),%XMM4,%XMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.50 |
VMOVD %XMM5,%EDI | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVSD (%R14),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %R9D,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 440b4 <_ZN11qmcplusplus27SoaDistanceTableABOMPTargetIdLj3ELi40EE8evaluateERNS_11ParticleSetE+0x754> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVSD 0x110(%R12),%XMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R10,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVSD 0x118(%R12),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x120(%R12),%XMM12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x128(%R12),%XMM13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x130(%R12),%XMM14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM10,0xb8(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x138(%R12),%XMM15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDX,%R9,1),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0xd8(%R12),%XMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM11,0xb0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x140(%R12),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVSD 0x148(%R12),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM12,0xa8(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x150(%R12),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM10,0x58(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0xc8(%R12),%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0xd0(%R12),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM2,0x78(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0xf8(%R12),%XMM12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x100(%R12),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM0,0x70(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x108(%R12),%XMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM13,0x90(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0xf0(%R12),%XMM13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM14,0xa0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0xe8(%R12),%XMM14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM15,0x88(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0xe0(%R12),%XMM15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM1,0x80(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD %XMM3,0x68(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD %XMM4,0x60(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x200,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x1000,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R15,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 43e38 <_ZN11qmcplusplus27SoaDistanceTableABOMPTargetIdLj3ELi40EE8evaluateERNS_11ParticleSetE+0x4d8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | qmcplusplus::SoaDistanceTableABOMPTarget |
Source file and lines | ParticleBConds3DSoa.h:280-298 |
Module | libqmcparticle_omptarget.so |
nb instructions | 13 |
nb uops | 12 |
loop length | 70 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 2.00 cycles |
front end | 2.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.90 | 1.80 | 1.33 | 1.33 | 0.00 | 1.80 | 1.70 | 0.00 | 0.00 | 0.00 | 1.80 | 1.33 |
cycles | 1.90 | 1.80 | 1.33 | 1.33 | 0.00 | 1.80 | 1.70 | 0.00 | 0.00 | 0.00 | 1.80 | 1.33 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 2.35-2.38 |
Stall cycles | 0.00 |
Front-end | 2.00 |
Dispatch | 1.90 |
Data deps. | 1.00 |
Overall L1 | 2.00 |
all | 20% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 25% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 11% |
load | 15% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LEA 0x200(%R9),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
VMOVSD (%R13),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R11),%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVD %ESI,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMINSD 0x40(%RSP),%XMM4,%XMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.50 |
VMOVD %XMM5,%EDI | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVSD (%R14),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %R9D,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 440b4 <_ZN11qmcplusplus27SoaDistanceTableABOMPTargetIdLj3ELi40EE8evaluateERNS_11ParticleSetE+0x754> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD $0x200,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x1000,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R15,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 43e38 <_ZN11qmcplusplus27SoaDistanceTableABOMPTargetIdLj3ELi40EE8evaluateERNS_11ParticleSetE+0x4d8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |