Loop Id: 2946 | Module: exec | Source: par_csr_matop.c:109-231 [...] | Coverage: 0.57% |
---|
Loop Id: 2946 | Module: exec | Source: par_csr_matop.c:109-231 [...] | Coverage: 0.57% |
---|
0x4cc000 MOV -0x30(%RBP),%R11 |
0x4cc004 CMP %R11,%R12 |
0x4cc007 LEA 0x1(%R12),%R12 |
0x4cc00c MOV -0x50(%RBP),%R13 |
0x4cc010 JE 4cbb80 |
0x4cc016 LEA (%R12,%R13,1),%RAX |
0x4cc01a MOV -0x70(%RBP),%RCX |
0x4cc01e MOV (%RCX,%RAX,8),%RDI |
0x4cc022 MOV 0x20(%RBP),%RCX |
0x4cc026 MOV (%RCX,%RDI,8),%RAX |
0x4cc02a MOV 0x8(%RCX,%RDI,8),%R13 |
0x4cc02f MOV %R13,%R9 |
0x4cc032 SUB %RAX,%R9 |
0x4cc035 JLE 4cc194 |
0x4cc03b CMP $0x8,%R9 |
0x4cc03f JAE 4cc080 |
0x4cc041 MOV %R9,%RCX |
0x4cc044 AND $-0x8,%RCX |
0x4cc048 CMP %R9,%RCX |
0x4cc04b JAE 4cc190 |
0x4cc051 ADD %RCX,%RAX |
0x4cc054 MOV 0x28(%RBP),%RSI |
0x4cc058 MOV -0x30(%RBP),%R11 |
0x4cc05c JMP 4cc06c |
(2949) 0x4cc060 INC %RAX |
(2949) 0x4cc063 CMP %RAX,%R13 |
(2949) 0x4cc066 JE 4cc194 |
(2949) 0x4cc06c MOV (%RSI,%RAX,8),%RCX |
(2949) 0x4cc070 CMP %R8,(%R14,%RCX,8) |
(2949) 0x4cc074 JGE 4cc060 |
(2949) 0x4cc076 MOV %R15,(%R14,%RCX,8) |
(2949) 0x4cc07a INC %R15 |
(2949) 0x4cc07d JMP 4cc060 |
0x4cc080 MOV %R9,%RCX |
0x4cc083 SHR $0x3,%RCX |
0x4cc087 MOV -0x68(%RBP),%RSI |
0x4cc08b LEA (%RSI,%RAX,8),%R11 |
0x4cc08f JMP 4cc0a9 |
(2950) 0x4cc0a0 ADD $0x40,%R11 |
(2950) 0x4cc0a4 DEC %RCX |
(2950) 0x4cc0a7 JE 4cc041 |
(2950) 0x4cc0a9 MOV -0x38(%R11),%RSI |
(2950) 0x4cc0ad CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc0b1 JGE 4cc120 |
(2950) 0x4cc0b3 MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc0b7 INC %R15 |
(2950) 0x4cc0ba MOV -0x30(%R11),%RSI |
(2950) 0x4cc0be CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc0c2 JL 4cc12a |
(2950) 0x4cc0c4 MOV -0x28(%R11),%RSI |
(2950) 0x4cc0c8 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc0cc JGE 4cc13b |
(2950) 0x4cc0ce MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc0d2 INC %R15 |
(2950) 0x4cc0d5 MOV -0x20(%R11),%RSI |
(2950) 0x4cc0d9 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc0dd JL 4cc145 |
(2950) 0x4cc0df MOV -0x18(%R11),%RSI |
(2950) 0x4cc0e3 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc0e7 JGE 4cc156 |
(2950) 0x4cc0e9 MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc0ed INC %R15 |
(2950) 0x4cc0f0 MOV -0x10(%R11),%RSI |
(2950) 0x4cc0f4 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc0f8 JL 4cc160 |
(2950) 0x4cc0fa MOV -0x8(%R11),%RSI |
(2950) 0x4cc0fe CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc102 JGE 4cc171 |
(2950) 0x4cc104 MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc108 INC %R15 |
(2950) 0x4cc10b MOV (%R11),%RSI |
(2950) 0x4cc10e CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc112 JGE 4cc0a0 |
(2950) 0x4cc114 JMP 4cc17e |
(2950) 0x4cc120 MOV -0x30(%R11),%RSI |
(2950) 0x4cc124 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc128 JGE 4cc0c4 |
(2950) 0x4cc12a MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc12e INC %R15 |
(2950) 0x4cc131 MOV -0x28(%R11),%RSI |
(2950) 0x4cc135 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc139 JL 4cc0ce |
(2950) 0x4cc13b MOV -0x20(%R11),%RSI |
(2950) 0x4cc13f CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc143 JGE 4cc0df |
(2950) 0x4cc145 MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc149 INC %R15 |
(2950) 0x4cc14c MOV -0x18(%R11),%RSI |
(2950) 0x4cc150 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc154 JL 4cc0e9 |
(2950) 0x4cc156 MOV -0x10(%R11),%RSI |
(2950) 0x4cc15a CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc15e JGE 4cc0fa |
(2950) 0x4cc160 MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc164 INC %R15 |
(2950) 0x4cc167 MOV -0x8(%R11),%RSI |
(2950) 0x4cc16b CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc16f JL 4cc104 |
(2950) 0x4cc171 MOV (%R11),%RSI |
(2950) 0x4cc174 CMP %R8,(%R14,%RSI,8) |
(2950) 0x4cc178 JGE 4cc0a0 |
(2950) 0x4cc17e MOV %R15,(%R14,%RSI,8) |
(2950) 0x4cc182 INC %R15 |
(2950) 0x4cc185 JMP 4cc0a0 |
0x4cc190 MOV -0x30(%RBP),%R11 |
0x4cc194 MOV 0x30(%RBP),%RCX |
0x4cc198 MOV (%RCX,%RDI,8),%RAX |
0x4cc19c MOV 0x8(%RCX,%RDI,8),%RCX |
0x4cc1a1 MOV %RCX,%RDI |
0x4cc1a4 SUB %RAX,%RDI |
0x4cc1a7 JLE 4cc004 |
0x4cc1ad CMP $0x4,%RDI |
0x4cc1b1 JAE 4cc210 |
0x4cc1b3 MOV 0x60(%RBP),%R13 |
0x4cc1b7 MOV %RDI,%RSI |
0x4cc1ba AND $-0x4,%RSI |
0x4cc1be CMP %RDI,%RSI |
0x4cc1c1 JAE 4cc000 |
0x4cc1c7 ADD %RSI,%RAX |
0x4cc1ca MOV 0x38(%RBP),%RDI |
0x4cc1ce MOV -0x30(%RBP),%R11 |
0x4cc1d2 JMP 4cc1ec |
(2947) 0x4cc1e0 INC %RAX |
(2947) 0x4cc1e3 CMP %RAX,%RCX |
(2947) 0x4cc1e6 JE 4cc004 |
(2947) 0x4cc1ec MOV (%RDI,%RAX,8),%RSI |
(2947) 0x4cc1f0 MOV (%R13,%RSI,8),%RSI |
(2947) 0x4cc1f5 ADD %RBX,%RSI |
(2947) 0x4cc1f8 CMP %R10,(%R14,%RSI,8) |
(2947) 0x4cc1fc JGE 4cc1e0 |
(2947) 0x4cc1fe MOV %RDX,(%R14,%RSI,8) |
(2947) 0x4cc202 INC %RDX |
(2947) 0x4cc205 JMP 4cc1e0 |
0x4cc210 MOV %RDI,%R9 |
0x4cc213 SHR $0x2,%R9 |
0x4cc217 MOV -0xc0(%RBP),%RSI |
0x4cc21e LEA (%RSI,%RAX,8),%R11 |
0x4cc222 MOV 0x60(%RBP),%R13 |
0x4cc226 JMP 4cc23d |
(2948) 0x4cc230 ADD $0x20,%R11 |
(2948) 0x4cc234 DEC %R9 |
(2948) 0x4cc237 JE 4cc1b7 |
(2948) 0x4cc23d MOV -0x18(%R11),%RSI |
(2948) 0x4cc241 MOV (%R13,%RSI,8),%RSI |
(2948) 0x4cc246 ADD %RBX,%RSI |
(2948) 0x4cc249 CMP %R10,(%R14,%RSI,8) |
(2948) 0x4cc24d JGE 4cc256 |
(2948) 0x4cc24f MOV %RDX,(%R14,%RSI,8) |
(2948) 0x4cc253 INC %RDX |
(2948) 0x4cc256 MOV -0x10(%R11),%RSI |
(2948) 0x4cc25a MOV (%R13,%RSI,8),%RSI |
(2948) 0x4cc25f ADD %RBX,%RSI |
(2948) 0x4cc262 CMP %R10,(%R14,%RSI,8) |
(2948) 0x4cc266 JGE 4cc26f |
(2948) 0x4cc268 MOV %RDX,(%R14,%RSI,8) |
(2948) 0x4cc26c INC %RDX |
(2948) 0x4cc26f MOV -0x8(%R11),%RSI |
(2948) 0x4cc273 MOV (%R13,%RSI,8),%RSI |
(2948) 0x4cc278 ADD %RBX,%RSI |
(2948) 0x4cc27b CMP %R10,(%R14,%RSI,8) |
(2948) 0x4cc27f JGE 4cc288 |
(2948) 0x4cc281 MOV %RDX,(%R14,%RSI,8) |
(2948) 0x4cc285 INC %RDX |
(2948) 0x4cc288 MOV (%R11),%RSI |
(2948) 0x4cc28b MOV (%R13,%RSI,8),%RSI |
(2948) 0x4cc290 ADD %RBX,%RSI |
(2948) 0x4cc293 CMP %R10,(%R14,%RSI,8) |
(2948) 0x4cc297 JGE 4cc230 |
(2948) 0x4cc299 MOV %RDX,(%R14,%RSI,8) |
(2948) 0x4cc29d INC %RDX |
(2948) 0x4cc2a0 JMP 4cc230 |
/scratch_na/users/xoserete/qaas_runs/171-172-8218/intel/AMG/build/AMG/AMG/parcsr_mv/par_csr_matop.c: 109 - 231 |
-------------------------------------------------------------------------------- |
109: if (ii < rest) |
[...] |
187: for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++) |
188: { |
189: i2 = A_diag_j[jj2]; |
[...] |
195: for (jj3 = B_diag_i[i2]; jj3 < B_diag_i[i2+1]; jj3++) |
196: { |
197: i3 = B_diag_j[jj3]; |
[...] |
205: if (B_marker[i3] < jj_row_begin_diag) |
206: { |
207: B_marker[i3] = jj_count_diag; |
208: jj_count_diag++; |
[...] |
216: if (num_cols_offd_B) |
217: { |
218: for (jj3 = B_offd_i[i2]; jj3 < B_offd_i[i2+1]; jj3++) |
219: { |
220: i3 = num_cols_diag_B+map_B_to_C[B_offd_j[jj3]]; |
[...] |
228: if (B_marker[i3] < jj_row_begin_offd) |
229: { |
230: B_marker[i3] = jj_count_offd; |
231: jj_count_offd++; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.39 |
Bottlenecks | micro-operation queue |
Function | hypre_ParMatmul_RowSizes.extracted |
Source | par_csr_matop.c:109-109,par_csr_matop.c:187-189,par_csr_matop.c:195-195,par_csr_matop.c:208-208,par_csr_matop.c:216-218 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.83 |
CQA cycles if no scalar integer | 8.83 |
CQA cycles if FP arith vectorized | 8.83 |
CQA cycles if fully vectorized | 1.10 |
Front-end cycles | 8.83 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 4.50 |
P1 cycles | 3.80 |
P2 cycles | 6.33 |
P3 cycles | 6.33 |
P4 cycles | 0.00 |
P5 cycles | 3.60 |
P6 cycles | 4.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 3.60 |
P11 cycles | 6.33 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 15.34 - 16.35 |
Stall cycles (UFS) | 6.08 - 7.09 |
Nb insns | 53.00 |
Nb uops | 53.00 |
Nb loads | 19.00 |
Nb stores | 0.00 |
Nb stack references | 10.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 17.21 |
Bytes prefetched | 0.00 |
Bytes loaded | 152.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.39 |
Bottlenecks | micro-operation queue |
Function | hypre_ParMatmul_RowSizes.extracted |
Source | par_csr_matop.c:109-109,par_csr_matop.c:187-189,par_csr_matop.c:195-195,par_csr_matop.c:208-208,par_csr_matop.c:216-218 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.83 |
CQA cycles if no scalar integer | 8.83 |
CQA cycles if FP arith vectorized | 8.83 |
CQA cycles if fully vectorized | 1.10 |
Front-end cycles | 8.83 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 4.50 |
P1 cycles | 3.80 |
P2 cycles | 6.33 |
P3 cycles | 6.33 |
P4 cycles | 0.00 |
P5 cycles | 3.60 |
P6 cycles | 4.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 3.60 |
P11 cycles | 6.33 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 15.34 - 16.35 |
Stall cycles (UFS) | 6.08 - 7.09 |
Nb insns | 53.00 |
Nb uops | 53.00 |
Nb loads | 19.00 |
Nb stores | 0.00 |
Nb stack references | 10.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 17.21 |
Bytes prefetched | 0.00 |
Bytes loaded | 152.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | hypre_ParMatmul_RowSizes.extracted |
Source file and lines | par_csr_matop.c:109-231 |
Module | exec |
nb instructions | 53 |
nb uops | 53 |
loop length | 203 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 8.83 cycles |
front end | 8.83 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 3.80 | 6.33 | 6.33 | 0.00 | 3.60 | 4.50 | 0.00 | 0.00 | 0.00 | 3.60 | 6.33 |
cycles | 4.50 | 3.80 | 6.33 | 6.33 | 0.00 | 3.60 | 4.50 | 0.00 | 0.00 | 0.00 | 3.60 | 6.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 15.34-16.35 |
Stall cycles | 6.08-7.09 |
LM full (events) | 8.52-9.49 |
Front-end | 8.83 |
Dispatch | 6.33 |
Overall L1 | 8.83 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %R11,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R12),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x50(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 4cbb80 <hypre_ParMatmul_RowSizes.extracted+0x200> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R12,%R13,1),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x70(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RAX,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R13,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4cc194 <hypre_ParMatmul_RowSizes.extracted+0x814> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc080 <hypre_ParMatmul_RowSizes.extracted+0x700> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x8,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %R9,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc190 <hypre_ParMatmul_RowSizes.extracted+0x810> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x28(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4cc06c <hypre_ParMatmul_RowSizes.extracted+0x6ec> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0x68(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RSI,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4cc0a9 <hypre_ParMatmul_RowSizes.extracted+0x729> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4cc004 <hypre_ParMatmul_RowSizes.extracted+0x684> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x4,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc210 <hypre_ParMatmul_RowSizes.extracted+0x890> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x60(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RDI,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x4,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %RDI,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc000 <hypre_ParMatmul_RowSizes.extracted+0x680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %RSI,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x38(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4cc1ec <hypre_ParMatmul_RowSizes.extracted+0x86c> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %RDI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x2,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0xc0(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RSI,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x60(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4cc23d <hypre_ParMatmul_RowSizes.extracted+0x8bd> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
Function | hypre_ParMatmul_RowSizes.extracted |
Source file and lines | par_csr_matop.c:109-231 |
Module | exec |
nb instructions | 53 |
nb uops | 53 |
loop length | 203 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 8.83 cycles |
front end | 8.83 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 3.80 | 6.33 | 6.33 | 0.00 | 3.60 | 4.50 | 0.00 | 0.00 | 0.00 | 3.60 | 6.33 |
cycles | 4.50 | 3.80 | 6.33 | 6.33 | 0.00 | 3.60 | 4.50 | 0.00 | 0.00 | 0.00 | 3.60 | 6.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 15.34-16.35 |
Stall cycles | 6.08-7.09 |
LM full (events) | 8.52-9.49 |
Front-end | 8.83 |
Dispatch | 6.33 |
Overall L1 | 8.83 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %R11,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R12),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x50(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 4cbb80 <hypre_ParMatmul_RowSizes.extracted+0x200> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R12,%R13,1),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x70(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RAX,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R13,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4cc194 <hypre_ParMatmul_RowSizes.extracted+0x814> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc080 <hypre_ParMatmul_RowSizes.extracted+0x700> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x8,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %R9,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc190 <hypre_ParMatmul_RowSizes.extracted+0x810> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x28(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4cc06c <hypre_ParMatmul_RowSizes.extracted+0x6ec> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0x68(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RSI,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4cc0a9 <hypre_ParMatmul_RowSizes.extracted+0x729> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%RDI,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4cc004 <hypre_ParMatmul_RowSizes.extracted+0x684> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x4,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc210 <hypre_ParMatmul_RowSizes.extracted+0x890> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x60(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RDI,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x4,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CMP %RDI,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 4cc000 <hypre_ParMatmul_RowSizes.extracted+0x680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %RSI,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x38(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4cc1ec <hypre_ParMatmul_RowSizes.extracted+0x86c> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV %RDI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x2,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV -0xc0(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RSI,%RAX,8),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x60(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4cc23d <hypre_ParMatmul_RowSizes.extracted+0x8bd> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |