Loop Id: 3562 | Module: exec | Source: par_csr_matop.c:109-231 [...] | Coverage: 0.6% |
---|
Loop Id: 3562 | Module: exec | Source: par_csr_matop.c:109-231 [...] | Coverage: 0.6% |
---|
0x4bc640 MOV -0x30(%RBP),%R11 |
0x4bc644 MOV -0x70(%RBP),%RDI |
0x4bc648 LEA 0x1(%RDI),%RAX |
0x4bc64c MOV -0x38(%RBP),%RCX |
0x4bc650 CMP %RCX,%RDI |
0x4bc653 JE 4bc1d0 |
0x4bc659 MOV %RAX,-0x70(%RBP) |
0x4bc65d ADD %R11,%RAX |
0x4bc660 MOV -0x88(%RBP),%RCX |
0x4bc667 MOV (%RCX,%RAX,8),%RDI |
0x4bc66b MOV 0x20(%RBP),%RCX |
0x4bc66f MOV (%RCX,%RDI,8),%RAX |
0x4bc673 MOV 0x8(%RCX,%RDI,8),%R13 |
0x4bc678 MOV %R13,%R9 |
0x4bc67b SUB %RAX,%R9 |
0x4bc67e JLE 4bc7e4 |
0x4bc684 CMP $0x8,%R9 |
0x4bc688 JAE 4bc6d0 |
0x4bc68a MOV %R9,%RCX |
0x4bc68d AND $-0x8,%RCX |
0x4bc691 CMP %R9,%RCX |
0x4bc694 JAE 4bc7e0 |
0x4bc69a ADD %RCX,%RAX |
0x4bc69d MOV 0x28(%RBP),%R9 |
0x4bc6a1 MOV -0x30(%RBP),%R11 |
0x4bc6a5 JMP 4bc6bc |
(3565) 0x4bc6b0 INC %RAX |
(3565) 0x4bc6b3 CMP %RAX,%R13 |
(3565) 0x4bc6b6 JE 4bc7e4 |
(3565) 0x4bc6bc MOV (%R9,%RAX,8),%RCX |
(3565) 0x4bc6c0 CMP %R8,(%R14,%RCX,8) |
(3565) 0x4bc6c4 JGE 4bc6b0 |
(3565) 0x4bc6c6 MOV %RBX,(%R14,%RCX,8) |
(3565) 0x4bc6ca INC %RBX |
(3565) 0x4bc6cd JMP 4bc6b0 |
0x4bc6d0 MOV %R9,%RCX |
0x4bc6d3 SHR $0x3,%RCX |
0x4bc6d7 MOV -0x80(%RBP),%R11 |
0x4bc6db LEA (%R11,%RAX,8),%R11 |
0x4bc6df JMP 4bc6f9 |
(3566) 0x4bc6f0 ADD $0x40,%R11 |
(3566) 0x4bc6f4 DEC %RCX |
(3566) 0x4bc6f7 JE 4bc68a |
(3566) 0x4bc6f9 MOV -0x38(%R11),%R12 |
(3566) 0x4bc6fd CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc701 JGE 4bc770 |
(3566) 0x4bc703 MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc707 INC %RBX |
(3566) 0x4bc70a MOV -0x30(%R11),%R12 |
(3566) 0x4bc70e CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc712 JL 4bc77a |
(3566) 0x4bc714 MOV -0x28(%R11),%R12 |
(3566) 0x4bc718 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc71c JGE 4bc78b |
(3566) 0x4bc71e MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc722 INC %RBX |
(3566) 0x4bc725 MOV -0x20(%R11),%R12 |
(3566) 0x4bc729 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc72d JL 4bc795 |
(3566) 0x4bc72f MOV -0x18(%R11),%R12 |
(3566) 0x4bc733 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc737 JGE 4bc7a6 |
(3566) 0x4bc739 MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc73d INC %RBX |
(3566) 0x4bc740 MOV -0x10(%R11),%R12 |
(3566) 0x4bc744 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc748 JL 4bc7b0 |
(3566) 0x4bc74a MOV -0x8(%R11),%R12 |
(3566) 0x4bc74e CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc752 JGE 4bc7c1 |
(3566) 0x4bc754 MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc758 INC %RBX |
(3566) 0x4bc75b MOV (%R11),%R12 |
(3566) 0x4bc75e CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc762 JGE 4bc6f0 |
(3566) 0x4bc764 JMP 4bc7ce |
(3566) 0x4bc770 MOV -0x30(%R11),%R12 |
(3566) 0x4bc774 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc778 JGE 4bc714 |
(3566) 0x4bc77a MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc77e INC %RBX |
(3566) 0x4bc781 MOV -0x28(%R11),%R12 |
(3566) 0x4bc785 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc789 JL 4bc71e |
(3566) 0x4bc78b MOV -0x20(%R11),%R12 |
(3566) 0x4bc78f CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc793 JGE 4bc72f |
(3566) 0x4bc795 MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc799 INC %RBX |
(3566) 0x4bc79c MOV -0x18(%R11),%R12 |
(3566) 0x4bc7a0 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc7a4 JL 4bc739 |
(3566) 0x4bc7a6 MOV -0x10(%R11),%R12 |
(3566) 0x4bc7aa CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc7ae JGE 4bc74a |
(3566) 0x4bc7b0 MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc7b4 INC %RBX |
(3566) 0x4bc7b7 MOV -0x8(%R11),%R12 |
(3566) 0x4bc7bb CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc7bf JL 4bc754 |
(3566) 0x4bc7c1 MOV (%R11),%R12 |
(3566) 0x4bc7c4 CMP %R8,(%R14,%R12,8) |
(3566) 0x4bc7c8 JGE 4bc6f0 |
(3566) 0x4bc7ce MOV %RBX,(%R14,%R12,8) |
(3566) 0x4bc7d2 INC %RBX |
(3566) 0x4bc7d5 JMP 4bc6f0 |
0x4bc7e0 MOV -0x30(%RBP),%R11 |
0x4bc7e4 MOV 0x30(%RBP),%RCX |
0x4bc7e8 MOV (%RCX,%RDI,8),%RAX |
0x4bc7ec MOV 0x8(%RCX,%RDI,8),%RCX |
0x4bc7f1 MOV %RCX,%RDI |
0x4bc7f4 SUB %RAX,%RDI |
0x4bc7f7 JLE 4bc644 |
0x4bc7fd CMP $0x4,%RDI |
0x4bc801 JAE 4bc850 |
0x4bc803 MOV %RDI,%R9 |
0x4bc806 AND $-0x4,%R9 |
0x4bc80a CMP %RDI,%R9 |
0x4bc80d JAE 4bc640 |
0x4bc813 ADD %R9,%RAX |
0x4bc816 MOV 0x38(%RBP),%R9 |
0x4bc81a MOV -0x30(%RBP),%R11 |
0x4bc81e JMP 4bc82c |
(3563) 0x4bc820 INC %RAX |
(3563) 0x4bc823 CMP %RAX,%RCX |
(3563) 0x4bc826 JE 4bc644 |
(3563) 0x4bc82c MOV (%R9,%RAX,8),%RDI |
(3563) 0x4bc830 MOV (%RSI,%RDI,8),%RDI |
(3563) 0x4bc834 ADD %R15,%RDI |
(3563) 0x4bc837 CMP %R10,(%R14,%RDI,8) |
(3563) 0x4bc83b JGE 4bc820 |
(3563) 0x4bc83d MOV %RDX,(%R14,%RDI,8) |
(3563) 0x4bc841 INC %RDX |
(3563) 0x4bc844 JMP 4bc820 |
0x4bc850 MOV %RDI,%R9 |
0x4bc853 SHR $0x2,%R9 |
0x4bc857 MOV -0xc8(%RBP),%R11 |
0x4bc85e LEA (%R11,%RAX,8),%R11 |
0x4bc862 JMP 4bc879 |
(3564) 0x4bc870 ADD $0x20,%R11 |
(3564) 0x4bc874 DEC %R9 |
(3564) 0x4bc877 JE 4bc803 |
(3564) 0x4bc879 MOV -0x18(%R11),%R12 |
(3564) 0x4bc87d MOV (%RSI,%R12,8),%R13 |
(3564) 0x4bc881 ADD %R15,%R13 |
(3564) 0x4bc884 CMP %R10,(%R14,%R13,8) |
(3564) 0x4bc888 JGE 4bc891 |
(3564) 0x4bc88a MOV %RDX,(%R14,%R13,8) |
(3564) 0x4bc88e INC %RDX |
(3564) 0x4bc891 MOV -0x10(%R11),%R12 |
(3564) 0x4bc895 MOV (%RSI,%R12,8),%R13 |
(3564) 0x4bc899 ADD %R15,%R13 |
(3564) 0x4bc89c CMP %R10,(%R14,%R13,8) |
(3564) 0x4bc8a0 JGE 4bc8a9 |
(3564) 0x4bc8a2 MOV %RDX,(%R14,%R13,8) |
(3564) 0x4bc8a6 INC %RDX |
(3564) 0x4bc8a9 MOV -0x8(%R11),%R12 |
(3564) 0x4bc8ad MOV (%RSI,%R12,8),%R13 |
(3564) 0x4bc8b1 ADD %R15,%R13 |
(3564) 0x4bc8b4 CMP %R10,(%R14,%R13,8) |
(3564) 0x4bc8b8 JGE 4bc8c1 |
(3564) 0x4bc8ba MOV %RDX,(%R14,%R13,8) |
(3564) 0x4bc8be INC %RDX |
(3564) 0x4bc8c1 MOV (%R11),%R12 |
(3564) 0x4bc8c4 MOV (%RSI,%R12,8),%R13 |
(3564) 0x4bc8c8 ADD %R15,%R13 |
(3564) 0x4bc8cb CMP %R10,(%R14,%R13,8) |
(3564) 0x4bc8cf JGE 4bc870 |
(3564) 0x4bc8d1 MOV %RDX,(%R14,%R13,8) |
(3564) 0x4bc8d5 INC %RDX |
(3564) 0x4bc8d8 JMP 4bc870 |
/scratch_na/users/xoserete/qaas_runs/171-172-8218/intel/AMG/build/AMG/AMG/parcsr_mv/par_csr_matop.c: 109 - 231 |
-------------------------------------------------------------------------------- |
109: if (ii < rest) |
[...] |
187: for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++) |
188: { |
189: i2 = A_diag_j[jj2]; |
[...] |
195: for (jj3 = B_diag_i[i2]; jj3 < B_diag_i[i2+1]; jj3++) |
196: { |
197: i3 = B_diag_j[jj3]; |
[...] |
205: if (B_marker[i3] < jj_row_begin_diag) |
206: { |
207: B_marker[i3] = jj_count_diag; |
208: jj_count_diag++; |
[...] |
216: if (num_cols_offd_B) |
217: { |
218: for (jj3 = B_offd_i[i2]; jj3 < B_offd_i[i2+1]; jj3++) |
219: { |
220: i3 = num_cols_diag_B+map_B_to_C[B_offd_j[jj3]]; |
[...] |
228: if (B_marker[i3] < jj_row_begin_offd) |
229: { |
230: B_marker[i3] = jj_count_offd; |
231: jj_count_offd++; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.47 |
Bottlenecks | micro-operation queue |
Function | hypre_ParMatmul_RowSizes.extracted |
Source | par_csr_matop.c:109-109,par_csr_matop.c:187-189,par_csr_matop.c:195-195,par_csr_matop.c:208-208,par_csr_matop.c:216-218 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.83 |
CQA cycles if no scalar integer | 8.83 |
CQA cycles if FP arith vectorized | 8.83 |
CQA cycles if fully vectorized | 1.10 |
Front-end cycles | 8.83 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 4.50 |
P1 cycles | 4.00 |
P2 cycles | 6.00 |
P3 cycles | 6.00 |
P4 cycles | 0.50 |
P5 cycles | 4.00 |
P6 cycles | 4.50 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 4.00 |
P11 cycles | 6.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 16.48 - 17.61 |
Stall cycles (UFS) | 7.21 - 8.32 |
Nb insns | 53.00 |
Nb uops | 53.00 |
Nb loads | 18.00 |
Nb stores | 1.00 |
Nb stack references | 10.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 17.21 |
Bytes prefetched | 0.00 |
Bytes loaded | 144.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.47 |
Bottlenecks | micro-operation queue |
Function | hypre_ParMatmul_RowSizes.extracted |
Source | par_csr_matop.c:109-109,par_csr_matop.c:187-189,par_csr_matop.c:195-195,par_csr_matop.c:208-208,par_csr_matop.c:216-218 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.83 |
CQA cycles if no scalar integer | 8.83 |
CQA cycles if FP arith vectorized | 8.83 |
CQA cycles if fully vectorized | 1.10 |
Front-end cycles | 8.83 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 4.50 |
P1 cycles | 4.00 |
P2 cycles | 6.00 |
P3 cycles | 6.00 |
P4 cycles | 0.50 |
P5 cycles | 4.00 |
P6 cycles | 4.50 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 4.00 |
P11 cycles | 6.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 16.48 - 17.61 |
Stall cycles (UFS) | 7.21 - 8.32 |
Nb insns | 53.00 |
Nb uops | 53.00 |
Nb loads | 18.00 |
Nb stores | 1.00 |
Nb stack references | 10.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 17.21 |
Bytes prefetched | 0.00 |
Bytes loaded | 144.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | hypre_ParMatmul_RowSizes.extracted |
Source file and lines | par_csr_matop.c:109-231 |
Module | exec |
nb instructions | 53 |
nb uops | 53 |
loop length | 204 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 8.83 cycles |
front end | 8.83 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 4.00 | 6.00 | 6.00 | 0.50 | 4.00 | 4.50 | 0.50 | 0.50 | 0.50 | 4.00 | 6.00 |
cycles | 4.50 | 4.00 | 6.00 | 6.00 | 0.50 | 4.00 | 4.50 | 0.50 | 0.50 | 0.50 | 4.00 | 6.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 16.48-17.61 |
Stall cycles | 7.21-8.32 |
LM full (events) | 9.79-11.37 |
Front-end | 8.83 |
Dispatch | 6.00 |
Overall L1 | 8.83 |
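Vectorization ratio all | 0% |
Vectorization ratio load | 0% |
Vectorization ratio store | 0% |
Vectorization ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vectorization ratio add-sub | NA (no add-sub vectorizable/vectorized instructions) |
Vectorization ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vectorization ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vectorization ratio other | 0% |
Vector-efficiency ratio all | 12% |
Vector-efficiency ratio load | 12% |
Vector-efficiency ratio store | 12% |
Vector-efficiency ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vector-efficiency ratio add-sub | NA (no add-sub vectorizable/vectorized instructions) |
Vector-efficiency ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vector-efficiency ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vector-efficiency ratio other | 12% |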
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x70(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%RDI),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %RCX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 4bc1d0 <hypre_ParMatmul_RowSizes.extracted+0x270> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %R11,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x88(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RAX,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R13,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4bc7e4 <hypre_ParMatmul_RowSizes.extracted+0x884> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc6d0 <hypre_ParMatmul_RowSizes.extracted+0x770> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x8,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %R9,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc7e0 <hypre_ParMatmul_RowSizes.extracted+0x880> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x28(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4bc6bc <hypre_ParMatmul_RowSizes.extracted+0x75c> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0x80(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R11,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4bc6f9 <hypre_ParMatmul_RowSizes.extracted+0x799> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4bc644 <hypre_ParMatmul_RowSizes.extracted+0x6e4> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x4,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc850 <hypre_ParMatmul_RowSizes.extracted+0x8f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RDI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %RDI,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc640 <hypre_ParMatmul_RowSizes.extracted+0x6e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R9,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x38(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4bc82c <hypre_ParMatmul_RowSizes.extracted+0x8cc> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %RDI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x2,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0xc8(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R11,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4bc879 <hypre_ParMatmul_RowSizes.extracted+0x919> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
Function | hypre_ParMatmul_RowSizes.extracted |
Source file and lines | par_csr_matop.c:109-231 |
Module | exec |
nb instructions | 53 |
nb uops | 53 |
loop length | 204 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 8.83 cycles |
front end | 8.83 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 4.00 | 6.00 | 6.00 | 0.50 | 4.00 | 4.50 | 0.50 | 0.50 | 0.50 | 4.00 | 6.00 |
cycles | 4.50 | 4.00 | 6.00 | 6.00 | 0.50 | 4.00 | 4.50 | 0.50 | 0.50 | 0.50 | 4.00 | 6.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 16.48-17.61 |
Stall cycles | 7.21-8.32 |
LM full (events) | 9.79-11.37 |
Front-end | 8.83 |
Dispatch | 6.00 |
Overall L1 | 8.83 |
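Vectorization ratio all | 0% |
Vectorization ratio load | 0% |
Vectorization ratio store | 0% |
Vectorization ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vectorization ratio add-sub | NA (no add-sub vectorizable/vectorized instructions) |
Vectorization ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vectorization ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vectorization ratio other | 0% |
Vector-efficiency ratio all | 12% |
Vector-efficiency ratio load | 12% |
Vector-efficiency ratio store | 12% |
Vector-efficiency ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vector-efficiency ratio add-sub | NA (no add-sub vectorizable/vectorized instructions) |
Vector-efficiency ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vector-efficiency ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vector-efficiency ratio other | 12% |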
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x70(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%RDI),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %RCX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 4bc1d0 <hypre_ParMatmul_RowSizes.extracted+0x270> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %R11,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x88(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RAX,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R13,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4bc7e4 <hypre_ParMatmul_RowSizes.extracted+0x884> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc6d0 <hypre_ParMatmul_RowSizes.extracted+0x770> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x8,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %R9,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc7e0 <hypre_ParMatmul_RowSizes.extracted+0x880> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x28(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4bc6bc <hypre_ParMatmul_RowSizes.extracted+0x75c> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0x80(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R11,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4bc6f9 <hypre_ParMatmul_RowSizes.extracted+0x799> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4bc644 <hypre_ParMatmul_RowSizes.extracted+0x6e4> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x4,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc850 <hypre_ParMatmul_RowSizes.extracted+0x8f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RDI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %RDI,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4bc640 <hypre_ParMatmul_RowSizes.extracted+0x6e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R9,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x38(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4bc82c <hypre_ParMatmul_RowSizes.extracted+0x8cc> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %RDI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x2,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0xc8(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R11,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4bc879 <hypre_ParMatmul_RowSizes.extracted+0x919> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |