Loop Id: 3 | Module: exec | Source: Step10_orig.c:19-31 | Coverage: 41.95% |
---|
Loop Id: 3 | Module: exec | Source: Step10_orig.c:19-31 | Coverage: 41.95% |
---|
0x4016f0 VMOVUPS (%RDX,%R11,1),%ZMM0 [1] |
0x4016f7 VMOVUPS (%RSI,%R11,1),%ZMM7 [3] |
0x4016fe VMOVUPS (%RCX,%R11,1),%ZMM5 [2] |
0x401705 VSUBPS %ZMM26,%ZMM0,%ZMM8 |
0x40170b VSUBPS %ZMM27,%ZMM7,%ZMM9 |
0x401711 VSUBPS %ZMM25,%ZMM5,%ZMM7 |
0x401717 VMULPS %ZMM8,%ZMM8,%ZMM29 |
0x40171d VFMADD231PS %ZMM9,%ZMM9,%ZMM29 |
0x401723 VFMADD231PS %ZMM7,%ZMM7,%ZMM29 |
0x401729 VADDPS %ZMM23,%ZMM29,%ZMM0 |
0x40172f VCMPPS $0xe,%ZMM16,%ZMM29,%K2 |
0x401736 VCMPPS $0x1,%ZMM24,%ZMM29,%K1 |
0x40173d VMOVUPS (%R8,%R11,1),%ZMM13{%K1} [4] |
0x401744 ADD $0x40,%R11 |
0x401748 VCVTPS2PD %YMM0,%ZMM6 |
0x40174e VEXTRACTF32X8 $0x1,%ZMM0,%YMM5 |
0x401755 VMOVAPS %ZMM29,%ZMM0 |
0x40175b VFMADD132PS %ZMM22,%ZMM21,%ZMM0 |
0x401761 VCVTPS2PD %YMM5,%ZMM5 |
0x401767 VMOVAPS %ZMM13,%ZMM28{%K1}{z} |
0x40176d VFMADD132PS %ZMM29,%ZMM20,%ZMM0 |
0x401773 VSQRTPD %ZMM6,%ZMM30 |
0x401779 VMULPD %ZMM30,%ZMM6,%ZMM6 |
0x40177f VSQRTPD %ZMM5,%ZMM31 |
0x401785 VMULPD %ZMM31,%ZMM5,%ZMM5 |
0x40178b VFMADD132PS %ZMM29,%ZMM19,%ZMM0 |
0x401791 VDIVPD %ZMM6,%ZMM15,%ZMM6 |
0x401797 VFMADD132PS %ZMM29,%ZMM18,%ZMM0 |
0x40179d VFMADD132PS %ZMM29,%ZMM17,%ZMM0 |
0x4017a3 VDIVPD %ZMM5,%ZMM15,%ZMM5 |
0x4017a9 VCVTPS2PD %YMM0,%ZMM29 |
0x4017af VEXTRACTF32X8 $0x1,%ZMM0,%YMM0 |
0x4017b6 VCVTPS2PD %YMM0,%ZMM0 |
0x4017bc VADDPD %ZMM29,%ZMM6,%ZMM6 |
0x4017c2 VCVTPD2PS %ZMM6,%YMM6 |
0x4017c8 VADDPD %ZMM0,%ZMM5,%ZMM5 |
0x4017ce VCVTPD2PS %ZMM5,%YMM0 |
0x4017d4 VINSERTF64X4 $0x1,%YMM0,%ZMM6,%ZMM5 |
0x4017db VMULPS %ZMM28,%ZMM5,%ZMM6 |
0x4017e1 VMULPS %ZMM6,%ZMM9,%ZMM0{%K2}{z} |
0x4017e7 VMULPS %ZMM6,%ZMM8,%ZMM9{%K2}{z} |
0x4017ed VMULPS %ZMM6,%ZMM7,%ZMM8{%K2}{z} |
0x4017f3 VADDPS %ZMM0,%ZMM10,%ZMM10 |
0x4017f9 VADDPS %ZMM9,%ZMM11,%ZMM11 |
0x4017ff VADDPS %ZMM8,%ZMM12,%ZMM12 |
0x401805 CMP %R11,%RBX |
0x401808 JNE 4016f0 |
/beegfs/hackathon/users/eoseret/qaas_runs/171-094-7986/intel/HACCmk/build/HACCmk/src/Step10_orig.c: 19 - 31 |
-------------------------------------------------------------------------------- |
19: for ( j = 0; j < count1; j++ ) |
20: { |
21: dxc = xx1[j] - xxi; |
22: dyc = yy1[j] - yyi; |
23: dzc = zz1[j] - zzi; |
24: |
25: r2 = dxc * dxc + dyc * dyc + dzc * dzc; |
26: |
27: m = ( r2 < fsrrmax2 ) ? mass1[j] : 0.0f; |
28: |
29: f = pow( r2 + mp_rsm2, -1.5 ) - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5))))); |
30: |
31: f = ( r2 > 0.0f ) ? m * f : 0.0f; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 2.38 |
Bottlenecks | P8, P9, |
Function | Step10_orig |
Source | Step10_orig.c:19-31 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 50.00 |
CQA cycles if no scalar integer | 50.00 |
CQA cycles if FP arith vectorized | 50.00 |
CQA cycles if fully vectorized | 50.00 |
Front-end cycles | 8.67 |
DIV/SQRT cycles | 50.00 |
P0 cycles | 0.50 |
P1 cycles | 0.50 |
P2 cycles | 0.25 |
P3 cycles | 0.25 |
P4 cycles | 0.50 |
P5 cycles | 2.67 |
P6 cycles | 2.67 |
P7 cycles | 2.67 |
P8 cycles | 21.00 |
P9 cycles | 19.00 |
P10 cycles | 15.50 |
P11 cycles | 15.50 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 47.00 |
Nb uops | 52.00 |
Nb loads | 4.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 9.60 |
Nb FLOP add-sub | 128.00 |
Nb FLOP mul | 96.00 |
Nb FLOP fma | 112.00 |
Nb FLOP div | 16.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 16.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 5.12 |
Bytes prefetched | 0.00 |
Bytes loaded | 256.00 |
Bytes stored | 0.00 |
Stride 0 | 0.00 |
Stride 1 | 4.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 100.00 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 100.00 |
Vector-efficiency ratio all | 92.05 |
Vector-efficiency ratio load | 100.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 73.08 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 2.38 |
Bottlenecks | P8, P9, |
Function | Step10_orig |
Source | Step10_orig.c:19-31 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 50.00 |
CQA cycles if no scalar integer | 50.00 |
CQA cycles if FP arith vectorized | 50.00 |
CQA cycles if fully vectorized | 50.00 |
Front-end cycles | 8.67 |
DIV/SQRT cycles | 50.00 |
P0 cycles | 0.50 |
P1 cycles | 0.50 |
P2 cycles | 0.25 |
P3 cycles | 0.25 |
P4 cycles | 0.50 |
P5 cycles | 2.67 |
P6 cycles | 2.67 |
P7 cycles | 2.67 |
P8 cycles | 21.00 |
P9 cycles | 19.00 |
P10 cycles | 15.50 |
P11 cycles | 15.50 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 47.00 |
Nb uops | 52.00 |
Nb loads | 4.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 9.60 |
Nb FLOP add-sub | 128.00 |
Nb FLOP mul | 96.00 |
Nb FLOP fma | 112.00 |
Nb FLOP div | 16.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 16.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 5.12 |
Bytes prefetched | 0.00 |
Bytes loaded | 256.00 |
Bytes stored | 0.00 |
Stride 0 | 0.00 |
Stride 1 | 4.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 100.00 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 100.00 |
Vector-efficiency ratio all | 92.05 |
Vector-efficiency ratio load | 100.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 73.08 |
Path / |
Function | Step10_orig |
Source file and lines | Step10_orig.c:19-31 |
Module | exec |
nb instructions | 47 |
nb uops | 52 |
loop length | 286 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 3 |
used zmm registers | 27 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 1.29 |
micro-operation queue | 8.67 cycles |
front end | 8.67 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 0.25 | 0.25 | 0.50 | 1.33 | 1.33 | 1.33 | 11.00 | 11.00 | 11.00 | 11.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 0.25 | 0.25 | 0.50 | 2.67 | 2.67 | 2.67 | 21.00 | 19.00 | 15.50 | 15.50 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 50.00 |
Longest recurrence chain latency (RecMII) | 3.00 |
Front-end | 8.67 |
Dispatch | 21.00 |
DIV/SQRT | 50.00 |
Data deps. | 3.00 |
Overall L1 | 50.00 |
all | 100% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 92% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 73% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPS (%RDX,%R11,1),%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVUPS (%RSI,%R11,1),%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVUPS (%RCX,%R11,1),%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VSUBPS %ZMM26,%ZMM0,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VSUBPS %ZMM27,%ZMM7,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VSUBPS %ZMM25,%ZMM5,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VMULPS %ZMM8,%ZMM8,%ZMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VFMADD231PS %ZMM9,%ZMM9,%ZMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VFMADD231PS %ZMM7,%ZMM7,%ZMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VADDPS %ZMM23,%ZMM29,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VCMPPS $0xe,%ZMM16,%ZMM29,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 1 |
VCMPPS $0x1,%ZMM24,%ZMM29,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 1 |
VMOVUPS (%R8,%R11,1),%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD $0x40,%R11 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VCVTPS2PD %YMM0,%ZMM6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VEXTRACTF32X8 $0x1,%ZMM0,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VMOVAPS %ZMM29,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VFMADD132PS %ZMM22,%ZMM21,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VCVTPS2PD %YMM5,%ZMM5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VMOVAPS %ZMM13,%ZMM28{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VFMADD132PS %ZMM29,%ZMM20,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VSQRTPD %ZMM6,%ZMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 21 | 16 |
VMULPD %ZMM30,%ZMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VSQRTPD %ZMM5,%ZMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 21 | 16 |
VMULPD %ZMM31,%ZMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VFMADD132PS %ZMM29,%ZMM19,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VDIVPD %ZMM6,%ZMM15,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 13 | 9 |
VFMADD132PS %ZMM29,%ZMM18,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VFMADD132PS %ZMM29,%ZMM17,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VDIVPD %ZMM5,%ZMM15,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 13 | 9 |
VCVTPS2PD %YMM0,%ZMM29 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VEXTRACTF32X8 $0x1,%ZMM0,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VCVTPS2PD %YMM0,%ZMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VADDPD %ZMM29,%ZMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VCVTPD2PS %ZMM6,%YMM6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 7 | 1.33 |
VADDPD %ZMM0,%ZMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VCVTPD2PS %ZMM5,%YMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 7 | 1.33 |
VINSERTF64X4 $0x1,%YMM0,%ZMM6,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 1 |
VMULPS %ZMM28,%ZMM5,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPS %ZMM6,%ZMM9,%ZMM0{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPS %ZMM6,%ZMM8,%ZMM9{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPS %ZMM6,%ZMM7,%ZMM8{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPS %ZMM0,%ZMM10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VADDPS %ZMM9,%ZMM11,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VADDPS %ZMM8,%ZMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
CMP %R11,%RBX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JNE 4016f0 <Step10_orig+0xb0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
Function | Step10_orig |
Source file and lines | Step10_orig.c:19-31 |
Module | exec |
nb instructions | 47 |
nb uops | 52 |
loop length | 286 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 3 |
used zmm registers | 27 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 1.29 |
micro-operation queue | 8.67 cycles |
front end | 8.67 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 0.25 | 0.25 | 0.50 | 1.33 | 1.33 | 1.33 | 11.00 | 11.00 | 11.00 | 11.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 0.25 | 0.25 | 0.50 | 2.67 | 2.67 | 2.67 | 21.00 | 19.00 | 15.50 | 15.50 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 50.00 |
Longest recurrence chain latency (RecMII) | 3.00 |
Front-end | 8.67 |
Dispatch | 21.00 |
DIV/SQRT | 50.00 |
Data deps. | 3.00 |
Overall L1 | 50.00 |
all | 100% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 92% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 73% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPS (%RDX,%R11,1),%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVUPS (%RSI,%R11,1),%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVUPS (%RCX,%R11,1),%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VSUBPS %ZMM26,%ZMM0,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VSUBPS %ZMM27,%ZMM7,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VSUBPS %ZMM25,%ZMM5,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VMULPS %ZMM8,%ZMM8,%ZMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VFMADD231PS %ZMM9,%ZMM9,%ZMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VFMADD231PS %ZMM7,%ZMM7,%ZMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VADDPS %ZMM23,%ZMM29,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VCMPPS $0xe,%ZMM16,%ZMM29,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 1 |
VCMPPS $0x1,%ZMM24,%ZMM29,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 1 |
VMOVUPS (%R8,%R11,1),%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD $0x40,%R11 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VCVTPS2PD %YMM0,%ZMM6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VEXTRACTF32X8 $0x1,%ZMM0,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VMOVAPS %ZMM29,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VFMADD132PS %ZMM22,%ZMM21,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VCVTPS2PD %YMM5,%ZMM5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VMOVAPS %ZMM13,%ZMM28{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VFMADD132PS %ZMM29,%ZMM20,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VSQRTPD %ZMM6,%ZMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 21 | 16 |
VMULPD %ZMM30,%ZMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VSQRTPD %ZMM5,%ZMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 21 | 16 |
VMULPD %ZMM31,%ZMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VFMADD132PS %ZMM29,%ZMM19,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VDIVPD %ZMM6,%ZMM15,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 13 | 9 |
VFMADD132PS %ZMM29,%ZMM18,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VFMADD132PS %ZMM29,%ZMM17,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 1 |
VDIVPD %ZMM5,%ZMM15,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 13 | 9 |
VCVTPS2PD %YMM0,%ZMM29 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VEXTRACTF32X8 $0x1,%ZMM0,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.50 |
VCVTPS2PD %YMM0,%ZMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 6 | 1.33 |
VADDPD %ZMM29,%ZMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VCVTPD2PS %ZMM6,%YMM6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 7 | 1.33 |
VADDPD %ZMM0,%ZMM5,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VCVTPD2PS %ZMM5,%YMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 1 | 0.50 | 0 | 0 | 7 | 1.33 |
VINSERTF64X4 $0x1,%YMM0,%ZMM6,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 1 |
VMULPS %ZMM28,%ZMM5,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPS %ZMM6,%ZMM9,%ZMM0{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPS %ZMM6,%ZMM8,%ZMM9{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPS %ZMM6,%ZMM7,%ZMM8{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPS %ZMM0,%ZMM10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VADDPS %ZMM9,%ZMM11,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
VADDPS %ZMM8,%ZMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 1 |
CMP %R11,%RBX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JNE 4016f0 <Step10_orig+0xb0> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |