Loop Id: 106 | Module: exec | Source: ljForce.c:173-216 [...] | Coverage: 0.02% |
---|
Loop Id: 106 | Module: exec | Source: ljForce.c:173-216 [...] | Coverage: 0.02% |
---|
0x2103b0 MOV -0x48(%RBP),%RSI |
0x2103b4 ADDL $0x40,-0x3c(%RBP) |
0x2103b8 LEA 0x1(%RSI),%RAX |
0x2103bc MOV %RAX,-0x48(%RBP) |
0x2103c0 CMP -0x78(%RBP),%RSI |
0x2103c4 JGE 2102b0 |
0x2103ca CMPL $0,-0x68(%RBP) |
0x2103ce JLE 2103b0 |
0x2103d0 MOVSXD -0x3c(%RBP),%RAX |
0x2103d4 MOV -0x70(%RBP),%R9 |
0x2103d8 MOV -0x48(%RBP),%RSI |
0x2103dc MOV -0x60(%RBP),%R10 |
0x2103e0 MOV %RAX,-0x80(%RBP) |
0x2103e4 MOV 0x80(%R9),%RAX |
0x2103eb MOV (%R10,%RSI,4),%EDI |
0x2103ef MOV %ESI,%R8D |
0x2103f2 SAL $0x6,%R8D |
0x2103f6 MOV (%RAX,%RSI,8),%RAX |
0x2103fa ADD %EDI,%R8D |
0x2103fd XOR %ESI,%ESI |
0x2103ff MOV %EDI,-0x50(%RBP) |
0x210402 MOV %RAX,-0x88(%RBP) |
0x210409 MOVSXD %R8D,%RAX |
0x21040c MOV %RAX,-0xa8(%RBP) |
0x210413 JMP 210430 |
(107) 0x210420 MOV -0x90(%RBP),%RSI |
(107) 0x210427 INC %RSI |
(107) 0x21042a CMP -0x68(%RBP),%RSI |
(107) 0x21042e JE 2103b0 |
(107) 0x210430 MOV -0x88(%RBP),%RAX |
(107) 0x210437 MOV (%RAX,%RSI,4),%EAX |
(107) 0x21043a TEST %EAX,%EAX |
(107) 0x21043c JS 2105bc |
(107) 0x210442 CMPL $0,-0x50(%RBP) |
(107) 0x210446 MOV %RSI,-0x90(%RBP) |
(107) 0x21044d JLE 210420 |
(107) 0x21044f MOV -0x60(%RBP),%RSI |
(107) 0x210453 MOV (%RSI,%RAX,4),%R12D |
(107) 0x210457 SAL $0x6,%EAX |
(107) 0x21045a MOVSXD %EAX,%RDI |
(107) 0x21045d LEA (%R12,%RAX,1),%ESI |
(107) 0x210461 LEA (,%RDI,8),%RAX |
(107) 0x210469 MOVSXD %ESI,%R10 |
(107) 0x21046c LEA (%RAX,%RAX,2),%RAX |
(107) 0x210470 MOV %RAX,-0xa0(%RBP) |
(107) 0x210477 MOV -0x80(%RBP),%RAX |
(107) 0x21047b JMP 21048c |
(108) 0x210480 INC %RAX |
(108) 0x210483 CMP -0xa8(%RBP),%RAX |
(108) 0x21048a JGE 210420 |
(108) 0x21048c TEST %R12D,%R12D |
(108) 0x21048f JLE 210480 |
(108) 0x210491 MOV -0x98(%RBP),%RSI |
(108) 0x210498 MOV -0xa0(%RBP),%R11 |
(108) 0x21049f LEA (%RAX,%RAX,2),%R13 |
(108) 0x2104a3 MOV 0x20(%RSI),%R9 |
(108) 0x2104a7 MOV 0x18(%R9),%R8 |
(108) 0x2104ab LEA (%R8,%R13,8),%RSI |
(108) 0x2104af LEA 0x10(%R8,%R11,1),%RBX |
(108) 0x2104b4 MOV %RDI,%R8 |
(108) 0x2104b7 JMP 2104cc |
(109) 0x2104c0 INC %R8 |
(109) 0x2104c3 ADD $0x18,%RBX |
(109) 0x2104c7 CMP %R10,%R8 |
(109) 0x2104ca JGE 210480 |
(109) 0x2104cc VMOVUPD (%RSI),%XMM7 |
(109) 0x2104d0 VSUBPD -0x10(%RBX),%XMM7,%XMM7 |
(109) 0x2104d5 VMOVAPD %XMM7,%XMM8 |
(109) 0x2104d9 VFMADD213SD %XMM0,%XMM7,%XMM8 |
(109) 0x2104de VPERMILPD $0x1,%XMM7,%XMM9 |
(109) 0x2104e4 VFMADD213SD %XMM8,%XMM9,%XMM9 |
(109) 0x2104e9 VMOVSD 0x10(%RSI),%XMM8 |
(109) 0x2104ee VSUBSD (%RBX),%XMM8,%XMM8 |
(109) 0x2104f2 VFMADD231SD %XMM8,%XMM8,%XMM9 |
(109) 0x2104f7 VUCOMISD %XMM0,%XMM9 |
(109) 0x2104fb JBE 2104c0 |
(109) 0x2104fd VMOVSD (%R15),%XMM10 |
(109) 0x210502 VUCOMISD %XMM9,%XMM10 |
(109) 0x210507 JB 2104c0 |
(109) 0x210509 VDIVSD %XMM9,%XMM1,%XMM9 |
(109) 0x21050e MOV 0x30(%R9),%R11 |
(109) 0x210512 VMOVSD (%R11,%RAX,8),%XMM12 |
(109) 0x210518 VMULSD %XMM9,%XMM9,%XMM10 |
(109) 0x21051d VMULSD %XMM10,%XMM9,%XMM10 |
(109) 0x210522 VMULSD (%R14),%XMM10,%XMM10 |
(109) 0x210527 VADDSD %XMM2,%XMM10,%XMM11 |
(109) 0x21052b VFMSUB213SD (%RDX),%XMM10,%XMM11 |
(109) 0x210530 VFMADD231SD %XMM3,%XMM11,%XMM12 |
(109) 0x210535 VMOVSD %XMM12,(%R11,%RAX,8) |
(109) 0x21053b VFMADD213SD -0x38(%RBP),%XMM3,%XMM11 |
(109) 0x210541 VMOVSD %XMM11,-0x38(%RBP) |
(109) 0x210546 VMULSD (%RCX),%XMM4,%XMM11 |
(109) 0x21054a MOV 0x28(%R9),%R11 |
(109) 0x21054e VMULSD %XMM11,%XMM10,%XMM11 |
(109) 0x210553 VFMADD213SD %XMM6,%XMM5,%XMM10 |
(109) 0x210558 VMULSD %XMM11,%XMM9,%XMM9 |
(109) 0x21055d VMULSD %XMM9,%XMM10,%XMM9 |
(109) 0x210562 VMOVDDUP %XMM9,%XMM10 |
(109) 0x210567 VFNMADD213PD (%R11,%R13,8),%XMM7,%XMM10 |
(109) 0x21056d VMOVUPD %XMM10,(%R11,%R13,8) |
(109) 0x210573 VFNMADD213SD 0x10(%R11,%R13,8),%XMM9,%XMM8 |
(109) 0x21057a VMOVSD %XMM8,0x10(%R11,%R13,8) |
(109) 0x210581 JMP 2104c0 |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-7424/intel/CoMD/build/CoMD/CoMD/src-openmp/ljForce.c: 173 - 216 |
-------------------------------------------------------------------------------- |
173: for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++) |
174: { |
175: int nIBox = s->boxes->nAtoms[iBox]; |
176: |
177: // loop over neighbors of iBox |
178: for (int jTmp=0; jTmp<nNbrBoxes; jTmp++) |
179: { |
180: int jBox = s->boxes->nbrBoxes[iBox][jTmp]; |
181: |
182: assert(jBox>=0); |
183: |
184: int nJBox = s->boxes->nAtoms[jBox]; |
185: |
186: // loop over atoms in iBox |
187: for (int iOff=MAXATOMS*iBox; iOff<(iBox*MAXATOMS+nIBox); iOff++) |
188: { |
189: |
190: // loop over atoms in jBox |
191: for (int jOff=jBox*MAXATOMS; jOff<(jBox*MAXATOMS+nJBox); jOff++) |
[...] |
197: dr[m] = s->atoms->r[iOff][m]-s->atoms->r[jOff][m]; |
198: r2+=dr[m]*dr[m]; |
199: } |
200: |
201: if ( r2 <= rCut2 && r2 > 0.0) |
202: { |
203: |
204: // Important note: |
205: // from this point on r actually refers to 1.0/r |
206: r2 = 1.0/r2; |
207: real_t r6 = s6 * (r2*r2*r2); |
208: real_t eLocal = r6 * (r6 - 1.0) - eShift; |
209: s->atoms->U[iOff] += 0.5*eLocal; |
210: ePot += 0.5*eLocal; |
211: |
212: // different formulation to avoid sqrt computation |
213: real_t fr = - 4.0*epsilon*r6*r2*(12.0*r6 - 6.0); |
214: for (int m=0; m<3; m++) |
215: { |
216: s->atoms->f[iOff][m] -= dr[m]*fr; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 5.67 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 |
Bottlenecks | |
Function | .omp_outlined..5.120 |
Source | ljForce.c:173-175,ljForce.c:178-178 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 2.83 |
CQA cycles if no scalar integer | 2.83 |
CQA cycles if FP arith vectorized | 2.83 |
CQA cycles if fully vectorized | 0.50 |
Front-end cycles | 2.33 |
DIV/SQRT cycles | 1.38 |
P0 cycles | 1.38 |
P1 cycles | 1.38 |
P2 cycles | 1.38 |
P3 cycles | 1.00 |
P4 cycles | 2.83 |
P5 cycles | 2.83 |
P6 cycles | 2.83 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 13.50 |
Nb uops | 14.00 |
Nb loads | 6.00 |
Nb stores | 3.00 |
Nb stack references | 6.50 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 20.80 |
Bytes prefetched | 0.00 |
Bytes loaded | 38.00 |
Bytes stored | 20.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 1.50 |
Stride indirect | 0.50 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 10.21 |
Vector-efficiency ratio load | 7.81 |
Vector-efficiency ratio store | 10.16 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 9.38 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 5.49 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.26 |
Bottlenecks | P5, P6, P7, |
Function | .omp_outlined..5.120 |
Source | ljForce.c:173-175,ljForce.c:178-178 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 4.00 |
CQA cycles if no scalar integer | 4.00 |
CQA cycles if FP arith vectorized | 4.00 |
CQA cycles if fully vectorized | 0.73 |
Front-end cycles | 3.17 |
DIV/SQRT cycles | 1.50 |
P0 cycles | 1.50 |
P1 cycles | 1.50 |
P2 cycles | 1.50 |
P3 cycles | 1.00 |
P4 cycles | 4.00 |
P5 cycles | 4.00 |
P6 cycles | 4.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 19.00 |
Nb uops | 19.00 |
Nb loads | 8.00 |
Nb stores | 4.00 |
Nb stack references | 9.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 20.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 52.00 |
Bytes stored | 28.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 10.00 |
Vector-efficiency ratio load | 6.25 |
Vector-efficiency ratio store | 10.94 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 6.25 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 6.15 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.11 |
Bottlenecks | P5, P6, P7, |
Function | .omp_outlined..5.120 |
Source | ljForce.c:173-175,ljForce.c:178-178 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 1.67 |
CQA cycles if no scalar integer | 1.67 |
CQA cycles if FP arith vectorized | 1.67 |
CQA cycles if fully vectorized | 0.27 |
Front-end cycles | 1.50 |
DIV/SQRT cycles | 1.25 |
P0 cycles | 1.25 |
P1 cycles | 1.25 |
P2 cycles | 1.25 |
P3 cycles | 1.00 |
P4 cycles | 1.67 |
P5 cycles | 1.67 |
P6 cycles | 1.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 8.00 |
Nb uops | 9.00 |
Nb loads | 4.00 |
Nb stores | 2.00 |
Nb stack references | 4.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 21.60 |
Bytes prefetched | 0.00 |
Bytes loaded | 24.00 |
Bytes stored | 12.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 10.42 |
Vector-efficiency ratio load | 9.38 |
Vector-efficiency ratio store | 9.38 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | .omp_outlined..5.120 |
Source file and lines | ljForce.c:173-216 |
Module | exec |
nb instructions | 13.50 |
nb uops | 14 |
loop length | 53.50 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 6.50 |
micro-operation queue | 2.33 cycles |
front end | 2.33 cycles |
| | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.38 | 1.38 | 1.38 | 1.38 | 1.00 | 2.83 | 2.83 | 2.83 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 1.38 | 1.38 | 1.38 | 1.38 | 1.00 | 2.83 | 2.83 | 2.83 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 2.33 |
Dispatch | 2.83 |
Data deps. | 0.00 |
Overall L1 | 2.83 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 10% |
load | 7% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 9% |
Function | .omp_outlined..5.120 |
Source file and lines | ljForce.c:173-216 |
Module | exec |
nb instructions | 19 |
nb uops | 19 |
loop length | 75 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 9 |
micro-operation queue | 3.17 cycles |
front end | 3.17 cycles |
| | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.50 | 1.50 | 1.50 | 1.50 | 1.00 | 4.00 | 4.00 | 4.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 1.50 | 1.50 | 1.50 | 1.50 | 1.00 | 4.00 | 4.00 | 4.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 3.17 |
Dispatch | 4.00 |
Data deps. | 0.00 |
Overall L1 | 4.00 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 10% |
load | 6% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 6% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CMPL $0,-0x68(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JLE 2103b0 <.omp_outlined..5.120+0x1a0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOVSXD -0x3c(%RBP),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV -0x70(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x48(%RBP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x60(%RBP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x80(%R9),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%R10,%RSI,4),%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %ESI,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SAL $0x6,%R8D | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV (%RAX,%RSI,8),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %EDI,%R8D | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %EDI,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %RAX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVSXD %R8D,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %RAX,-0xa8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JMP 210430 <.omp_outlined..5.120+0x220> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Function | .omp_outlined..5.120 |
Source file and lines | ljForce.c:173-216 |
Module | exec |
nb instructions | 8 |
nb uops | 9 |
loop length | 32 |
used x86 registers | 3 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 4 |
micro-operation queue | 1.50 cycles |
front end | 1.50 cycles |
| | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.25 | 1.25 | 1.25 | 1.25 | 1.00 | 1.67 | 1.67 | 1.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 1.25 | 1.25 | 1.25 | 1.25 | 1.00 | 1.67 | 1.67 | 1.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 1.50 |
Dispatch | 1.67 |
Data deps. | 0.00 |
Overall L1 | 1.67 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 10% |
load | 9% |
store | 9% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x48(%RBP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADDL $0x40,-0x3c(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
LEA 0x1(%RSI),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMP -0x78(%RBP),%RSI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JGE 2102b0 <.omp_outlined..5.120+0xa0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
CMPL $0,-0x68(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JLE 2103b0 <.omp_outlined..5.120+0x1a0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |