Loop Id: 52 | Module: exec | Source: linkCells.c:178-378 [...] | Coverage: 0.11% |
---|
Loop Id: 52 | Module: exec | Source: linkCells.c:178-378 [...] | Coverage: 0.11% |
---|
0x20c0c0 INC %EDI |
0x20c0c2 MOV %EDX,%EAX |
0x20c0c4 SAL $0x6,%EAX |
0x20c0c7 VMOVAPS -0x80(%RBP),%XMM0 [10] |
0x20c0cc VMOVAPS -0x70(%RBP),%XMM2 [10] |
0x20c0d1 VMOVSD -0x48(%RBP),%XMM1 [10] |
0x20c0d6 ADD $0x38,%R13 |
0x20c0da MOV %EDI,(%RCX,%RDX,4) [1] |
0x20c0dd ADD %EAX,%ESI |
0x20c0df MOV -0x2c(%RBP),%EDX [10] |
0x20c0e2 MOV -0x40(%RBP),%RDI [10] |
0x20c0e6 MOV 0x8(%R12),%RAX [4] |
0x20c0eb MOVSXD %ESI,%RCX |
0x20c0ee MOV %R14D,(%RAX,%RCX,4) [5] |
0x20c0f2 MOV 0x10(%R12),%RAX [4] |
0x20c0f7 MOV %EDX,(%RAX,%RCX,4) [7] |
0x20c0fa SAL $0x3,%RCX |
0x20c0fe MOV 0x18(%R12),%RAX [4] |
0x20c103 LEA (%RCX,%RCX,2),%RCX |
0x20c107 VMOVUPS %XMM0,(%RAX,%RCX,1) [8] |
0x20c10c VMOVAPS -0x60(%RBP),%XMM0 [10] |
0x20c111 VMOVSD %XMM0,0x10(%RAX,%RCX,1) [8] |
0x20c117 MOV 0x20(%R12),%RAX [4] |
0x20c11c VMOVUPS %XMM2,(%RAX,%RCX,1) [9] |
0x20c121 VMOVSD %XMM1,0x10(%RAX,%RCX,1) [9] |
0x20c127 CMP %R13,-0x38(%RBP) [10] |
0x20c12b JE 20c22c |
0x20c131 MOV 0x18(%RDI),%R15 [3] |
0x20c135 VMOVUPD 0x8(%RBX,%R13,1),%XMM2 [2] |
0x20c13c VMOVSD 0x30(%R15),%XMM0 [6] |
0x20c142 VUCOMISD %XMM2,%XMM0 |
0x20c146 JBE 20c170 |
0x20c148 VSUBSD 0x18(%R15),%XMM2,%XMM0 [6] |
0x20c14e MOV (%R15),%ECX [6] |
0x20c151 VMULSD 0x60(%R15),%XMM0,%XMM0 [6] |
0x20c157 LEA -0x1(%RCX),%ESI |
0x20c15a VROUNDSD $0x9,%XMM0,%XMM0,%XMM0 |
0x20c160 VCVTTSD2SI %XMM0,%EAX |
0x20c164 CMP %EAX,%ECX |
0x20c166 CMOVNE %EAX,%ESI |
0x20c169 JMP 20c173 |
0x20c170 MOV (%R15),%ESI [6] |
0x20c173 VMOVUPS 0x20(%RBX,%R13,1),%XMM0 [2] |
0x20c17a VMOVSD 0x18(%RBX,%R13,1),%XMM3 [2] |
0x20c181 VMOVQ 0x4(%R15),%XMM1 [6] |
0x20c187 VMOVAPD %XMM2,-0x80(%RBP) [10] |
0x20c18c MOV 0x4(%RBX,%R13,1),%EAX [2] |
0x20c191 MOV 0x20(%RDI),%R12 [3] |
0x20c195 MOV (%RBX,%R13,1),%R14D [2] |
0x20c199 MOV %R15,%RDI |
0x20c19c MOV %EAX,-0x2c(%RBP) [10] |
0x20c19f VMOVAPS %XMM0,-0x70(%RBP) [10] |
0x20c1a4 VMOVSD 0x30(%RBX,%R13,1),%XMM0 [2] |
0x20c1ab VSHUFPD $0x1,%XMM3,%XMM2,%XMM2 |
0x20c1b0 VCMPPD $0x1,0x38(%R15),%XMM2,%K2 [6] |
0x20c1bb VMOVAPD %XMM3,-0x60(%RBP) [10] |
0x20c1c0 VPCMPEQD %XMM2,%XMM2,%XMM2 |
0x20c1c4 VMOVSD %XMM0,-0x48(%RBP) [10] |
0x20c1c9 VMOVDDUP 0x10(%RBX,%R13,1),%XMM0 [2] |
0x20c1d0 VUNPCKLPD %XMM3,%XMM0,%XMM0 |
0x20c1d4 VSUBPD 0x20(%R15),%XMM0,%XMM0 [6] |
0x20c1da VMULPD 0x68(%R15),%XMM0,%XMM0 [6] |
0x20c1e0 VROUNDPD $0x9,%XMM0,%XMM0 |
0x20c1e6 VCVTTPD2DQ %XMM0,%XMM0 |
0x20c1ea VPCMPEQD %XMM0,%XMM1,%K1 |
0x20c1f0 VPADDD %XMM2,%XMM1,%XMM0{%K1} |
0x20c1f6 VMOVDQA32 %XMM0,%XMM1{%K2} |
0x20c1fc VMOVD %XMM1,%EDX |
0x20c200 VPEXTRD $0x1,%XMM1,%ECX |
0x20c206 CALL 20f6b0 <getBoxFromTuple> |
0x20c20b MOV 0x78(%R15),%RCX [6] |
0x20c20f MOV %EAX,%EDX |
0x20c211 MOV (%RCX,%RDX,4),%ESI [1] |
0x20c214 MOV %ESI,%EDI |
0x20c216 CMP 0xc(%R15),%EAX [6] |
0x20c21a JGE 20c0c0 |
0x20c220 INCL (%R12) [4] |
0x20c224 MOV (%RCX,%RDX,4),%EDI [1] |
0x20c227 JMP 20c0c0 |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-7424/intel/CoMD/build/CoMD/CoMD/src-openmp/haloExchange.c: 414 - 424 |
-------------------------------------------------------------------------------- |
414: for (int ii=0; ii<nBuf; ++ii) |
415: { |
416: int gid = buf[ii].gid; |
417: int type = buf[ii].type; |
418: real_t rx = buf[ii].rx; |
[...] |
424: putAtomInBox(s->boxes, s->atoms, gid, type, rx, ry, rz, px, py, pz); |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-7424/intel/CoMD/build/CoMD/CoMD/src-openmp/linkCells.c: 178 - 378 |
-------------------------------------------------------------------------------- |
178: int iOff = iBox*MAXATOMS; |
179: iOff += boxes->nAtoms[iBox]; |
180: |
181: // assign values to array elements |
182: if (iBox < boxes->nLocalBoxes) |
183: atoms->nLocal++; |
184: boxes->nAtoms[iBox]++; |
185: atoms->gid[iOff] = gid; |
186: atoms->iSpecies[iOff] = iType; |
187: |
188: atoms->r[iOff][0] = x; |
189: atoms->r[iOff][1] = y; |
190: atoms->r[iOff][2] = z; |
191: |
192: atoms->p[iOff][0] = px; |
193: atoms->p[iOff][1] = py; |
194: atoms->p[iOff][2] = pz; |
[...] |
352: int ix = (int)(floor((rr[0] - localMin[0])*boxes->invBoxSize[0])); |
[...] |
359: if (rr[0] < localMax[0]) |
360: { |
361: if (ix == gridSize[0]) ix = gridSize[0] - 1; |
362: } |
363: else |
364: ix = gridSize[0]; // assign to halo cell |
365: if (rr[1] < localMax[1]) |
[...] |
371: if (rr[2] < localMax[2]) |
[...] |
378: return getBoxFromTuple(boxes, ix, iy, iz); |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.80 |
CQA speedup if FP arith vectorized | 1.30 |
CQA speedup if fully vectorized | 4.08 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.14 |
Bottlenecks | |
Function | unloadAtomsBuffer |
Source | haloExchange.c:414-418,haloExchange.c:424-424,linkCells.c:178-194,linkCells.c:352-352,linkCells.c:359-365,linkCells.c:371-371,linkCells.c:378-378 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 14.67 |
CQA cycles if no scalar integer | 8.17 |
CQA cycles if FP arith vectorized | 11.30 |
CQA cycles if fully vectorized | 3.59 |
Front-end cycles | 12.83 |
DIV/SQRT cycles | 3.31 |
P0 cycles | 3.31 |
P1 cycles | 3.19 |
P2 cycles | 3.19 |
P3 cycles | 2.50 |
P4 cycles | 14.67 |
P5 cycles | 14.67 |
P6 cycles | 14.67 |
P7 cycles | 3.50 |
P8 cycles | 3.54 |
P9 cycles | 3.58 |
P10 cycles | 3.38 |
P11 cycles | 5.75 |
P12 cycles | 5.75 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 72.50 |
Nb uops | 77.00 |
Nb loads | 31.00 |
Nb stores | 12.50 |
Nb stack references | 7.00 |
FLOP/cycle | 0.34 |
Nb FLOP add-sub | 2.50 |
Nb FLOP mul | 2.50 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 27.69 |
Bytes prefetched | 0.00 |
Bytes loaded | 284.00 |
Bytes stored | 122.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 3.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 44.59 |
Vectorization ratio load | 42.16 |
Vectorization ratio store | 41.67 |
Vectorization ratio mul | 75.00 |
Vectorization ratio add_sub | 58.33 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 49.28 |
Vector-efficiency ratio all | 16.55 |
Vector-efficiency ratio load | 16.78 |
Vector-efficiency ratio store | 15.63 |
Vector-efficiency ratio mul | 21.88 |
Vector-efficiency ratio add_sub | 17.97 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.19 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.84 |
CQA speedup if FP arith vectorized | 1.32 |
CQA speedup if fully vectorized | 4.13 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.10 |
Bottlenecks | P5, P6, P7, |
Function | unloadAtomsBuffer |
Source | haloExchange.c:414-418,haloExchange.c:424-424,linkCells.c:178-194,linkCells.c:352-352,linkCells.c:359-365,linkCells.c:371-371,linkCells.c:378-378 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 15.33 |
CQA cycles if no scalar integer | 8.33 |
CQA cycles if FP arith vectorized | 11.58 |
CQA cycles if fully vectorized | 3.71 |
Front-end cycles | 14.00 |
DIV/SQRT cycles | 4.00 |
P0 cycles | 4.00 |
P1 cycles | 4.00 |
P2 cycles | 4.00 |
P3 cycles | 3.00 |
P4 cycles | 15.33 |
P5 cycles | 15.33 |
P6 cycles | 15.33 |
P7 cycles | 4.00 |
P8 cycles | 4.00 |
P9 cycles | 4.17 |
P10 cycles | 3.83 |
P11 cycles | 6.00 |
P12 cycles | 6.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 78.00 |
Nb uops | 84.00 |
Nb loads | 33.00 |
Nb stores | 13.00 |
Nb stack references | 7.00 |
FLOP/cycle | 0.39 |
Nb FLOP add-sub | 3.00 |
Nb FLOP mul | 3.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 27.39 |
Bytes prefetched | 0.00 |
Bytes loaded | 296.00 |
Bytes stored | 124.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 3.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 41.67 |
Vectorization ratio load | 40.00 |
Vectorization ratio store | 41.67 |
Vectorization ratio mul | 50.00 |
Vectorization ratio add_sub | 50.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 41.18 |
Vector-efficiency ratio all | 16.15 |
Vector-efficiency ratio load | 16.56 |
Vector-efficiency ratio store | 15.63 |
Vector-efficiency ratio mul | 18.75 |
Vector-efficiency ratio add_sub | 17.19 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 15.81 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.69 |
CQA speedup if FP arith vectorized | 1.31 |
CQA speedup if fully vectorized | 4.02 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.11 |
Bottlenecks | P5, P6, P7, |
Function | unloadAtomsBuffer |
Source | haloExchange.c:414-418,haloExchange.c:424-424,linkCells.c:178-194,linkCells.c:352-352,linkCells.c:359-365,linkCells.c:371-371,linkCells.c:378-378 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 14.67 |
CQA cycles if no scalar integer | 8.67 |
CQA cycles if FP arith vectorized | 11.21 |
CQA cycles if fully vectorized | 3.65 |
Front-end cycles | 13.17 |
DIV/SQRT cycles | 3.50 |
P0 cycles | 3.50 |
P1 cycles | 3.25 |
P2 cycles | 3.25 |
P3 cycles | 2.50 |
P4 cycles | 14.67 |
P5 cycles | 14.67 |
P6 cycles | 14.67 |
P7 cycles | 4.00 |
P8 cycles | 4.00 |
P9 cycles | 4.17 |
P10 cycles | 3.83 |
P11 cycles | 6.00 |
P12 cycles | 6.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 75.00 |
Nb uops | 79.00 |
Nb loads | 31.00 |
Nb stores | 12.00 |
Nb stack references | 7.00 |
FLOP/cycle | 0.41 |
Nb FLOP add-sub | 3.00 |
Nb FLOP mul | 3.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 27.82 |
Bytes prefetched | 0.00 |
Bytes loaded | 288.00 |
Bytes stored | 120.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 3.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 42.55 |
Vectorization ratio load | 42.11 |
Vectorization ratio store | 41.67 |
Vectorization ratio mul | 50.00 |
Vectorization ratio add_sub | 50.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 43.75 |
Vector-efficiency ratio all | 16.36 |
Vector-efficiency ratio load | 17.11 |
Vector-efficiency ratio store | 15.63 |
Vector-efficiency ratio mul | 18.75 |
Vector-efficiency ratio add_sub | 17.19 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 16.41 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.91 |
CQA speedup if FP arith vectorized | 1.28 |
CQA speedup if fully vectorized | 4.14 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.17 |
Bottlenecks | P5, P6, P7, |
Function | unloadAtomsBuffer |
Source | haloExchange.c:414-418,haloExchange.c:424-424,linkCells.c:178-194,linkCells.c:352-352,linkCells.c:359-365,linkCells.c:371-371,linkCells.c:378-378 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 14.67 |
CQA cycles if no scalar integer | 7.67 |
CQA cycles if FP arith vectorized | 11.42 |
CQA cycles if fully vectorized | 3.54 |
Front-end cycles | 12.50 |
DIV/SQRT cycles | 3.25 |
P0 cycles | 3.25 |
P1 cycles | 3.00 |
P2 cycles | 3.00 |
P3 cycles | 2.50 |
P4 cycles | 14.67 |
P5 cycles | 14.67 |
P6 cycles | 14.67 |
P7 cycles | 3.00 |
P8 cycles | 3.08 |
P9 cycles | 3.00 |
P10 cycles | 2.92 |
P11 cycles | 5.50 |
P12 cycles | 5.50 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 70.00 |
Nb uops | 75.00 |
Nb loads | 31.00 |
Nb stores | 13.00 |
Nb stack references | 7.00 |
FLOP/cycle | 0.27 |
Nb FLOP add-sub | 2.00 |
Nb FLOP mul | 2.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 27.55 |
Bytes prefetched | 0.00 |
Bytes loaded | 280.00 |
Bytes stored | 124.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 3.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 46.51 |
Vectorization ratio load | 42.11 |
Vectorization ratio store | 41.67 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 66.67 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 53.85 |
Vector-efficiency ratio all | 16.72 |
Vector-efficiency ratio load | 16.45 |
Vector-efficiency ratio store | 15.63 |
Vector-efficiency ratio mul | 25.00 |
Vector-efficiency ratio add_sub | 18.75 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.79 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.75 |
CQA speedup if FP arith vectorized | 1.27 |
CQA speedup if fully vectorized | 4.02 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.20 |
Bottlenecks | P5, P6, P7, |
Function | unloadAtomsBuffer |
Source | haloExchange.c:414-418,haloExchange.c:424-424,linkCells.c:178-194,linkCells.c:352-352,linkCells.c:359-365,linkCells.c:371-371,linkCells.c:378-378 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 14.00 |
CQA cycles if no scalar integer | 8.00 |
CQA cycles if FP arith vectorized | 11.00 |
CQA cycles if fully vectorized | 3.48 |
Front-end cycles | 11.67 |
DIV/SQRT cycles | 2.50 |
P0 cycles | 2.50 |
P1 cycles | 2.50 |
P2 cycles | 2.50 |
P3 cycles | 2.00 |
P4 cycles | 14.00 |
P5 cycles | 14.00 |
P6 cycles | 14.00 |
P7 cycles | 3.00 |
P8 cycles | 3.08 |
P9 cycles | 3.00 |
P10 cycles | 2.92 |
P11 cycles | 5.50 |
P12 cycles | 5.50 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 67.00 |
Nb uops | 70.00 |
Nb loads | 29.00 |
Nb stores | 12.00 |
Nb stack references | 7.00 |
FLOP/cycle | 0.29 |
Nb FLOP add-sub | 2.00 |
Nb FLOP mul | 2.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 28.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 272.00 |
Bytes stored | 120.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 3.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 47.62 |
Vectorization ratio load | 44.44 |
Vectorization ratio store | 41.67 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 66.67 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 58.33 |
Vector-efficiency ratio all | 16.96 |
Vector-efficiency ratio load | 17.01 |
Vector-efficiency ratio store | 15.63 |
Vector-efficiency ratio mul | 25.00 |
Vector-efficiency ratio add_sub | 18.75 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 18.75 |
Path / |
Function | unloadAtomsBuffer |
Source file and lines | linkCells.c:178-378 |
Module | exec |
nb instructions | 72.50 |
nb uops | 77 |
loop length | 334 |
used x86 registers | 11 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 12.83 cycles |
front end | 12.83 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.31 | 3.31 | 3.19 | 3.19 | 2.50 | 14.67 | 14.67 | 14.67 | 3.50 | 3.54 | 3.58 | 3.38 | 5.75 | 5.75 |
cycles | 3.31 | 3.31 | 3.19 | 3.19 | 2.50 | 14.67 | 14.67 | 14.67 | 3.50 | 3.54 | 3.58 | 3.38 | 5.75 | 5.75 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 12.83 |
Dispatch | 14.67 |
Data deps. | 0.00 |
Overall L1 | 14.67 |
all | 23% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 40% |
all | 57% |
load | 57% |
store | 62% |
mul | 75% |
add-sub | 75% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 58% |
all | 44% |
load | 42% |
store | 41% |
mul | 75% |
add-sub | 58% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 49% |
all | 11% |
load | 8% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 14% |
all | 19% |
load | 19% |
store | 20% |
mul | 21% |
add-sub | 21% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 19% |
all | 16% |
load | 16% |
store | 15% |
mul | 21% |
add-sub | 17% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 17% |
Function | unloadAtomsBuffer |
Source file and lines | linkCells.c:178-378 |
Module | exec |
nb instructions | 78 |
nb uops | 84 |
loop length | 356 |
used x86 registers | 11 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 14.00 cycles |
front end | 14.00 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.00 | 4.00 | 4.00 | 4.00 | 3.00 | 15.33 | 15.33 | 15.33 | 4.00 | 4.00 | 4.17 | 3.83 | 6.00 | 6.00 |
cycles | 4.00 | 4.00 | 4.00 | 4.00 | 3.00 | 15.33 | 15.33 | 15.33 | 4.00 | 4.00 | 4.17 | 3.83 | 6.00 | 6.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 14.00 |
Dispatch | 15.33 |
Data deps. | 0.00 |
Overall L1 | 15.33 |
all | 22% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 33% |
all | 53% |
load | 53% |
store | 62% |
mul | 50% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 50% |
all | 41% |
load | 40% |
store | 41% |
mul | 50% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 41% |
all | 11% |
load | 8% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 13% |
all | 19% |
load | 19% |
store | 20% |
mul | 18% |
add-sub | 18% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
all | 16% |
load | 16% |
store | 15% |
mul | 18% |
add-sub | 17% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 15% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
INC %EDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SAL $0x6,%EAX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPS -0x80(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPS -0x70(%RBP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD -0x48(%RBP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD $0x38,%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDI,(%RCX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD %EAX,%ESI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x8(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOVSXD %ESI,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R14D,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x10(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EDX,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SAL $0x3,%RCX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RCX,%RCX,2),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVUPS %XMM0,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVAPS -0x60(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD %XMM0,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV 0x20(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPS %XMM2,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD %XMM1,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CMP %R13,-0x38(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JE 20c22c <unloadAtomsBuffer+0x1dc> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV 0x18(%RDI),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPD 0x8(%RBX,%R13,1),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x30(%R15),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUCOMISD %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 6 | 1 |
JBE 20c170 <unloadAtomsBuffer+0x120> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
VSUBSD 0x18(%R15),%XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
MOV (%R15),%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMULSD 0x60(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
LEA -0x1(%RCX),%ESI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VROUNDSD $0x9,%XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VCVTTSD2SI %XMM0,%EAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 8 | 1 |
CMP %EAX,%ECX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMOVNE %EAX,%ESI | 1 | 0.50 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
JMP 20c173 <unloadAtomsBuffer+0x123> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VMOVUPS 0x20(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x18(%RBX,%R13,1),%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVQ 0x4(%R15),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPD %XMM2,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
MOV 0x4(%RBX,%R13,1),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x20(%RDI),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RBX,%R13,1),%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %EAX,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPS %XMM0,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD 0x30(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSHUFPD $0x1,%XMM3,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VCMPPD $0x1,0x38(%R15),%XMM2,%K2 | 2 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %XMM3,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VPCMPEQD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVDDUP 0x10(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUNPCKLPD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VSUBPD 0x20(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VMULPD 0x68(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VROUNDPD $0x9,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VCVTTPD2DQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VPCMPEQD %XMM0,%XMM1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPADDD %XMM2,%XMM1,%XMM0{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVDQA32 %XMM0,%XMM1{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVD %XMM1,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 5 | 1 |
VPEXTRD $0x1,%XMM1,%ECX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CALL 20f6b0 <getBoxFromTuple> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x78(%R15),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EAX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV (%RCX,%RDX,4),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %ESI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP 0xc(%R15),%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JGE 20c0c0 <unloadAtomsBuffer+0x70> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
INCL (%R12) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV (%RCX,%RDX,4),%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
JMP 20c0c0 <unloadAtomsBuffer+0x70> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Function | unloadAtomsBuffer |
Source file and lines | linkCells.c:178-378 |
Module | exec |
nb instructions | 75 |
nb uops | 79 |
loop length | 344 |
used x86 registers | 11 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 13.17 cycles |
front end | 13.17 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.50 | 3.50 | 3.25 | 3.25 | 2.50 | 14.67 | 14.67 | 14.67 | 4.00 | 4.00 | 4.17 | 3.83 | 6.00 | 6.00 |
cycles | 3.50 | 3.50 | 3.25 | 3.25 | 2.50 | 14.67 | 14.67 | 14.67 | 4.00 | 4.00 | 4.17 | 3.83 | 6.00 | 6.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 13.17 |
Dispatch | 14.67 |
Data deps. | 0.00 |
Overall L1 | 14.67 |
all | 23% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 37% |
all | 53% |
load | 53% |
store | 62% |
mul | 50% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 50% |
all | 42% |
load | 42% |
store | 41% |
mul | 50% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 43% |
all | 11% |
load | 9% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 14% |
all | 19% |
load | 19% |
store | 20% |
mul | 18% |
add-sub | 18% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
all | 16% |
load | 17% |
store | 15% |
mul | 18% |
add-sub | 17% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 16% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
INC %EDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SAL $0x6,%EAX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPS -0x80(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPS -0x70(%RBP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD -0x48(%RBP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD $0x38,%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDI,(%RCX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD %EAX,%ESI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x8(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOVSXD %ESI,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R14D,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x10(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EDX,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SAL $0x3,%RCX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RCX,%RCX,2),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVUPS %XMM0,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVAPS -0x60(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD %XMM0,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV 0x20(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPS %XMM2,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD %XMM1,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CMP %R13,-0x38(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JE 20c22c <unloadAtomsBuffer+0x1dc> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV 0x18(%RDI),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPD 0x8(%RBX,%R13,1),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x30(%R15),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUCOMISD %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 6 | 1 |
JBE 20c170 <unloadAtomsBuffer+0x120> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
VSUBSD 0x18(%R15),%XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
MOV (%R15),%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMULSD 0x60(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
LEA -0x1(%RCX),%ESI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VROUNDSD $0x9,%XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VCVTTSD2SI %XMM0,%EAX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 8 | 1 |
CMP %EAX,%ECX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMOVNE %EAX,%ESI | 1 | 0.50 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
JMP 20c173 <unloadAtomsBuffer+0x123> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
VMOVUPS 0x20(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x18(%RBX,%R13,1),%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVQ 0x4(%R15),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPD %XMM2,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
MOV 0x4(%RBX,%R13,1),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x20(%RDI),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RBX,%R13,1),%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %EAX,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPS %XMM0,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD 0x30(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSHUFPD $0x1,%XMM3,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VCMPPD $0x1,0x38(%R15),%XMM2,%K2 | 2 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %XMM3,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VPCMPEQD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVDDUP 0x10(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUNPCKLPD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VSUBPD 0x20(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VMULPD 0x68(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VROUNDPD $0x9,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VCVTTPD2DQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VPCMPEQD %XMM0,%XMM1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPADDD %XMM2,%XMM1,%XMM0{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVDQA32 %XMM0,%XMM1{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVD %XMM1,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 5 | 1 |
VPEXTRD $0x1,%XMM1,%ECX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CALL 20f6b0 <getBoxFromTuple> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x78(%R15),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EAX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV (%RCX,%RDX,4),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %ESI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP 0xc(%R15),%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JGE 20c0c0 <unloadAtomsBuffer+0x70> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
Function | unloadAtomsBuffer |
Source file and lines | linkCells.c:178-378 |
Module | exec |
nb instructions | 70 |
nb uops | 75 |
loop length | 324 |
used x86 registers | 11 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 12.50 cycles |
front end | 12.50 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.25 | 3.25 | 3.00 | 3.00 | 2.50 | 14.67 | 14.67 | 14.67 | 3.00 | 3.08 | 3.00 | 2.92 | 5.50 | 5.50 |
cycles | 3.25 | 3.25 | 3.00 | 3.00 | 2.50 | 14.67 | 14.67 | 14.67 | 3.00 | 3.08 | 3.00 | 2.92 | 5.50 | 5.50 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 12.50 |
Dispatch | 14.67 |
Data deps. | 0.00 |
Overall L1 | 14.67 |
all | 23% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 42% |
all | 61% |
load | 61% |
store | 62% |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 66% |
all | 46% |
load | 42% |
store | 41% |
mul | 100% |
add-sub | 66% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 53% |
all | 11% |
load | 8% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 15% |
all | 20% |
load | 20% |
store | 20% |
mul | 25% |
add-sub | 25% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
all | 16% |
load | 16% |
store | 15% |
mul | 25% |
add-sub | 18% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 17% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
INC %EDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SAL $0x6,%EAX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPS -0x80(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPS -0x70(%RBP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD -0x48(%RBP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD $0x38,%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDI,(%RCX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD %EAX,%ESI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x8(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOVSXD %ESI,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R14D,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x10(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EDX,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SAL $0x3,%RCX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RCX,%RCX,2),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVUPS %XMM0,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVAPS -0x60(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD %XMM0,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV 0x20(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPS %XMM2,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD %XMM1,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CMP %R13,-0x38(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JE 20c22c <unloadAtomsBuffer+0x1dc> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV 0x18(%RDI),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPD 0x8(%RBX,%R13,1),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x30(%R15),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUCOMISD %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 6 | 1 |
JBE 20c170 <unloadAtomsBuffer+0x120> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV (%R15),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPS 0x20(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x18(%RBX,%R13,1),%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVQ 0x4(%R15),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPD %XMM2,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
MOV 0x4(%RBX,%R13,1),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x20(%RDI),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RBX,%R13,1),%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %EAX,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPS %XMM0,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD 0x30(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSHUFPD $0x1,%XMM3,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VCMPPD $0x1,0x38(%R15),%XMM2,%K2 | 2 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %XMM3,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VPCMPEQD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVDDUP 0x10(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUNPCKLPD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VSUBPD 0x20(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VMULPD 0x68(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VROUNDPD $0x9,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VCVTTPD2DQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VPCMPEQD %XMM0,%XMM1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPADDD %XMM2,%XMM1,%XMM0{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVDQA32 %XMM0,%XMM1{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVD %XMM1,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 5 | 1 |
VPEXTRD $0x1,%XMM1,%ECX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CALL 20f6b0 <getBoxFromTuple> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x78(%R15),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EAX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV (%RCX,%RDX,4),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %ESI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP 0xc(%R15),%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JGE 20c0c0 <unloadAtomsBuffer+0x70> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
INCL (%R12) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV (%RCX,%RDX,4),%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
JMP 20c0c0 <unloadAtomsBuffer+0x70> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Function | unloadAtomsBuffer |
Source file and lines | linkCells.c:178-378 |
Module | exec |
nb instructions | 67 |
nb uops | 70 |
loop length | 312 |
used x86 registers | 11 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 11.67 cycles |
front end | 11.67 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 2.50 | 2.50 | 2.00 | 14.00 | 14.00 | 14.00 | 3.00 | 3.08 | 3.00 | 2.92 | 5.50 | 5.50 |
cycles | 2.50 | 2.50 | 2.50 | 2.50 | 2.00 | 14.00 | 14.00 | 14.00 | 3.00 | 3.08 | 3.00 | 2.92 | 5.50 | 5.50 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
Front-end | 11.67 |
Dispatch | 14.00 |
Data deps. | 0.00 |
Overall L1 | 14.00 |
all | 25% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 50% |
all | 61% |
load | 61% |
store | 62% |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 66% |
all | 47% |
load | 44% |
store | 41% |
mul | 100% |
add-sub | 66% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 58% |
all | 11% |
load | 8% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 16% |
all | 20% |
load | 20% |
store | 20% |
mul | 25% |
add-sub | 25% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
all | 16% |
load | 17% |
store | 15% |
mul | 25% |
add-sub | 18% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
INC %EDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SAL $0x6,%EAX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPS -0x80(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPS -0x70(%RBP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD -0x48(%RBP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD $0x38,%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDI,(%RCX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD %EAX,%ESI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x8(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOVSXD %ESI,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R14D,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x10(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EDX,(%RAX,%RCX,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SAL $0x3,%RCX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RCX,%RCX,2),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVUPS %XMM0,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVAPS -0x60(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD %XMM0,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV 0x20(%R12),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPS %XMM2,(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD %XMM1,0x10(%RAX,%RCX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CMP %R13,-0x38(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JE 20c22c <unloadAtomsBuffer+0x1dc> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV 0x18(%RDI),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPD 0x8(%RBX,%R13,1),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x30(%R15),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUCOMISD %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 6 | 1 |
JBE 20c170 <unloadAtomsBuffer+0x120> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV (%R15),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVUPS 0x20(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVSD 0x18(%RBX,%R13,1),%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVQ 0x4(%R15),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVAPD %XMM2,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
MOV 0x4(%RBX,%R13,1),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x20(%RDI),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RBX,%R13,1),%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %EAX,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPS %XMM0,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VMOVSD 0x30(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSHUFPD $0x1,%XMM3,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VCMPPD $0x1,0x38(%R15),%XMM2,%K2 | 2 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVAPD %XMM3,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 4 | 1 |
VPCMPEQD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
VMOVDDUP 0x10(%RBX,%R13,1),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VUNPCKLPD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
VSUBPD 0x20(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VMULPD 0x68(%R15),%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VROUNDPD $0x9,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VCVTTPD2DQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
VPCMPEQD %XMM0,%XMM1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPADDD %XMM2,%XMM1,%XMM0{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
VMOVDQA32 %XMM0,%XMM1{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVD %XMM1,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 5 | 1 |
VPEXTRD $0x1,%XMM1,%ECX | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 6 | 1 |
CALL 20f6b0 <getBoxFromTuple> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x78(%R15),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %EAX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV (%RCX,%RDX,4),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %ESI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP 0xc(%R15),%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JGE 20c0c0 <unloadAtomsBuffer+0x70> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |