Loop Id: 4061 | Module: exec | Source: csr_matvec.c:181-206 [...] | Coverage: 0.01% |
---|
Loop Id: 4061 | Module: exec | Source: csr_matvec.c:181-206 [...] | Coverage: 0.01% |
---|
0x4ce240 MOV -0x40(%RBP),%RAX |
0x4ce244 MOV (%RAX),%RAX |
0x4ce247 VADDSD (%RAX,%RDI,8),%XMM1,%XMM1 |
0x4ce24c VMOVSD %XMM1,(%RAX,%RDI,8) |
0x4ce251 MOV -0x98(%RBP),%RCX |
0x4ce258 LEA 0x1(%RCX),%RAX |
0x4ce25c CMP -0x80(%RBP),%RCX |
0x4ce260 MOV %RAX,%RCX |
0x4ce263 JE 4ce204 |
0x4ce265 MOV -0x88(%RBP),%RAX |
0x4ce26c MOV %RCX,-0x98(%RBP) |
0x4ce273 ADD %RCX,%RAX |
0x4ce276 MOV -0x90(%RBP),%RCX |
0x4ce27d MOV (%RCX,%RAX,8),%RDI |
0x4ce281 CMPQ $0x1,-0x48(%RBP) |
0x4ce286 JNE 4ce330 |
0x4ce28c MOV -0x38(%RBP),%RAX |
0x4ce290 MOV (%RAX),%RAX |
0x4ce293 MOV (%RAX,%RDI,8),%R10 |
0x4ce297 MOV 0x8(%RAX,%RDI,8),%RAX |
0x4ce29c VXORPD %XMM1,%XMM1,%XMM1 |
0x4ce2a0 MOV %RAX,%R11 |
0x4ce2a3 SUB %R10,%R11 |
0x4ce2a6 JLE 4ce240 |
0x4ce2a8 MOV -0x50(%RBP),%RCX |
0x4ce2ac MOV (%RCX),%RCX |
0x4ce2af MOV -0x60(%RBP),%RDX |
0x4ce2b3 MOV (%RDX),%RSI |
0x4ce2b6 MOV -0x58(%RBP),%RDX |
0x4ce2ba MOV (%RDX),%R8 |
0x4ce2bd MOV %R11,%R9 |
0x4ce2c0 AND $-0x4,%R9 |
0x4ce2c4 JE 4ce494 |
0x4ce2ca LEA -0x1(%R9),%RDX |
0x4ce2ce LEA (%RCX,%R10,8),%RBX |
0x4ce2d2 LEA (%R8,%R10,8),%R14 |
0x4ce2d6 VXORPD %XMM1,%XMM1,%XMM1 |
0x4ce2da XOR %R15D,%R15D |
0x4ce2dd NOPL (%RAX) |
(4066) 0x4ce2e0 VMOVUPD (%R14,%R15,8),%YMM2 |
(4066) 0x4ce2e6 KXNORW %K0,%K0,%K1 |
(4066) 0x4ce2ea VXORPD %XMM3,%XMM3,%XMM3 |
(4066) 0x4ce2ee VGATHERQPD (%RSI,%YMM2,8),%YMM3{%K1} |
(4066) 0x4ce2f5 VFMADD231PD (%RBX,%R15,8),%YMM3,%YMM1 |
(4066) 0x4ce2fb ADD $0x4,%R15 |
(4066) 0x4ce2ff CMP %RDX,%R15 |
(4066) 0x4ce302 JBE 4ce2e0 |
0x4ce304 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
0x4ce30a VADDPD %XMM2,%XMM1,%XMM1 |
0x4ce30e VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
0x4ce313 VADDSD %XMM2,%XMM1,%XMM1 |
0x4ce317 CMP %R9,%R11 |
0x4ce31a JE 4ce240 |
0x4ce320 JMP 4ce497 |
0x4ce330 JL 4ce251 |
0x4ce336 MOV -0x38(%RBP),%RAX |
0x4ce33a MOV (%RAX),%RAX |
0x4ce33d MOV (%RAX,%RDI,8),%R8 |
0x4ce341 MOV 0x8(%RAX,%RDI,8),%R9 |
0x4ce346 MOV -0x40(%RBP),%RAX |
0x4ce34a MOV (%RAX),%R10 |
0x4ce34d MOV %R9,%RBX |
0x4ce350 SUB %R8,%RBX |
0x4ce353 IMUL -0x78(%RBP),%RDI |
0x4ce358 MOV %RBX,-0xa8(%RBP) |
0x4ce35f AND $-0x4,%RBX |
0x4ce363 LEA -0x1(%RBX),%RCX |
0x4ce367 LEA (,%R8,8),%RAX |
0x4ce36f MOV %RAX,-0xa0(%RBP) |
0x4ce376 XOR %EDX,%EDX |
0x4ce378 JMP 4ce3ae |
(4062) 0x4ce380 MOV %RDX,%RAX |
(4062) 0x4ce383 IMUL -0xb8(%RBP),%RAX |
(4062) 0x4ce38b ADD %RDI,%RAX |
(4062) 0x4ce38e VADDSD (%R10,%RAX,8),%XMM1,%XMM1 |
(4062) 0x4ce394 VMOVSD %XMM1,(%R10,%RAX,8) |
(4062) 0x4ce39a LEA 0x1(%RDX),%RAX |
(4062) 0x4ce39e CMP -0xb0(%RBP),%RDX |
(4062) 0x4ce3a5 MOV %RAX,%RDX |
(4062) 0x4ce3a8 JE 4ce251 |
(4062) 0x4ce3ae VXORPD %XMM1,%XMM1,%XMM1 |
(4062) 0x4ce3b2 CMP %R8,%R9 |
(4062) 0x4ce3b5 JLE 4ce380 |
(4062) 0x4ce3b7 MOV -0x50(%RBP),%RAX |
(4062) 0x4ce3bb MOV (%RAX),%R15 |
(4062) 0x4ce3be MOV -0x60(%RBP),%RAX |
(4062) 0x4ce3c2 MOV (%RAX),%RSI |
(4062) 0x4ce3c5 MOV -0x58(%RBP),%RAX |
(4062) 0x4ce3c9 MOV (%RAX),%RAX |
(4062) 0x4ce3cc TEST %RBX,%RBX |
(4062) 0x4ce3cf JE 4ce453 |
(4062) 0x4ce3d5 MOV %RDX,%R11 |
(4062) 0x4ce3d8 IMUL -0x68(%RBP),%R11 |
(4062) 0x4ce3dd VPBROADCASTQ %R11,%YMM2 |
(4062) 0x4ce3e3 MOV -0xa0(%RBP),%R14 |
(4062) 0x4ce3ea LEA (%R15,%R14,1),%R11 |
(4062) 0x4ce3ee LEA (%RAX,%R14,1),%R13 |
(4062) 0x4ce3f2 VXORPD %XMM1,%XMM1,%XMM1 |
(4062) 0x4ce3f6 XOR %R14D,%R14D |
(4062) 0x4ce3f9 NOPL (%RAX) |
(4064) 0x4ce400 VXORPS %XMM3,%XMM3,%XMM3 |
(4064) 0x4ce404 VPMULLQ (%R13,%R14,8),%YMM0,%YMM3 |
(4064) 0x4ce40c VPADDQ %YMM2,%YMM3,%YMM3 |
(4064) 0x4ce410 VXORPD %XMM4,%XMM4,%XMM4 |
(4064) 0x4ce414 KXNORW %K0,%K0,%K1 |
(4064) 0x4ce418 VGATHERQPD (%RSI,%YMM3,8),%YMM4{%K1} |
(4064) 0x4ce41f VFMADD231PD (%R11,%R14,8),%YMM4,%YMM1 |
(4064) 0x4ce425 ADD $0x4,%R14 |
(4064) 0x4ce429 CMP %RCX,%R14 |
(4064) 0x4ce42c JBE 4ce400 |
(4062) 0x4ce42e VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(4062) 0x4ce434 VADDPD %XMM2,%XMM1,%XMM1 |
(4062) 0x4ce438 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(4062) 0x4ce43d VADDSD %XMM2,%XMM1,%XMM1 |
(4062) 0x4ce441 MOV %RBX,%R11 |
(4062) 0x4ce444 CMP %RBX,-0xa8(%RBP) |
(4062) 0x4ce44b JE 4ce380 |
(4062) 0x4ce451 JMP 4ce456 |
(4062) 0x4ce453 XOR %R11D,%R11D |
(4062) 0x4ce456 MOV %RDX,%R13 |
(4062) 0x4ce459 IMUL -0x68(%RBP),%R13 |
(4062) 0x4ce45e ADD %R8,%R11 |
(4062) 0x4ce461 NOPW %CS:(%RAX,%RAX,1) |
(4063) 0x4ce470 MOV (%RAX,%R11,8),%R14 |
(4063) 0x4ce474 IMUL %R12,%R14 |
(4063) 0x4ce478 ADD %R13,%R14 |
(4063) 0x4ce47b VMOVSD (%RSI,%R14,8),%XMM2 |
(4063) 0x4ce481 VFMADD231SD (%R15,%R11,8),%XMM2,%XMM1 |
(4063) 0x4ce487 INC %R11 |
(4063) 0x4ce48a CMP %R11,%R9 |
(4063) 0x4ce48d JNE 4ce470 |
(4062) 0x4ce48f JMP 4ce380 |
0x4ce494 XOR %R9D,%R9D |
0x4ce497 ADD %R10,%R9 |
0x4ce49a NOPW (%RAX,%RAX,1) |
(4065) 0x4ce4a0 MOV (%R8,%R9,8),%RDX |
(4065) 0x4ce4a4 VMOVSD (%RSI,%RDX,8),%XMM2 |
(4065) 0x4ce4a9 VFMADD231SD (%RCX,%R9,8),%XMM2,%XMM1 |
(4065) 0x4ce4af INC %R9 |
(4065) 0x4ce4b2 CMP %R9,%RAX |
(4065) 0x4ce4b5 JNE 4ce4a0 |
0x4ce4b7 JMP 4ce240 |
/scratch_na/users/xoserete/qaas_runs/171-415-3661/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 181 - 206 |
-------------------------------------------------------------------------------- |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
205: } |
206: } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.24 |
CQA speedup if FP arith vectorized | 3.61 |
CQA speedup if fully vectorized | 12.32 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.31 |
Bottlenecks | micro-operation queue, |
Function | hypre_CSRMatrixMatvecOutOfPlace.extracted.19 |
Source | csr_matvec.c:181-183,csr_matvec.c:191-191,csr_matvec.c:194-196,csr_matvec.c:199-199,csr_matvec.c:202-206 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 11.33 |
CQA cycles if no scalar integer | 3.50 |
CQA cycles if FP arith vectorized | 3.14 |
CQA cycles if fully vectorized | 0.92 |
Front-end cycles | 11.33 |
DIV/SQRT cycles | 4.20 |
P0 cycles | 4.20 |
P1 cycles | 8.67 |
P2 cycles | 8.67 |
P3 cycles | 2.00 |
P4 cycles | 4.20 |
P5 cycles | 4.20 |
P6 cycles | 2.00 |
P7 cycles | 2.00 |
P8 cycles | 2.00 |
P9 cycles | 4.20 |
P10 cycles | 8.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 13.24 |
Stall cycles (UFS) | 1.09 |
Nb insns | 67.00 |
Nb uops | 67.00 |
Nb loads | 26.00 |
Nb stores | 4.00 |
Nb stack references | 13.00 |
FLOP/cycle | 0.35 |
Nb FLOP add-sub | 4.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 21.18 |
Bytes prefetched | 0.00 |
Bytes loaded | 208.00 |
Bytes stored | 32.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 27.78 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 25.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 44.44 |
Vector-efficiency ratio all | 15.63 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 15.63 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.36 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.24 |
CQA speedup if FP arith vectorized | 3.61 |
CQA speedup if fully vectorized | 12.32 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.31 |
Bottlenecks | micro-operation queue, |
Function | hypre_CSRMatrixMatvecOutOfPlace.extracted.19 |
Source | csr_matvec.c:181-183,csr_matvec.c:191-191,csr_matvec.c:194-196,csr_matvec.c:199-199,csr_matvec.c:202-206 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 11.33 |
CQA cycles if no scalar integer | 3.50 |
CQA cycles if FP arith vectorized | 3.14 |
CQA cycles if fully vectorized | 0.92 |
Front-end cycles | 11.33 |
DIV/SQRT cycles | 4.20 |
P0 cycles | 4.20 |
P1 cycles | 8.67 |
P2 cycles | 8.67 |
P3 cycles | 2.00 |
P4 cycles | 4.20 |
P5 cycles | 4.20 |
P6 cycles | 2.00 |
P7 cycles | 2.00 |
P8 cycles | 2.00 |
P9 cycles | 4.20 |
P10 cycles | 8.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 13.24 |
Stall cycles (UFS) | 1.09 |
Nb insns | 67.00 |
Nb uops | 67.00 |
Nb loads | 26.00 |
Nb stores | 4.00 |
Nb stack references | 13.00 |
FLOP/cycle | 0.35 |
Nb FLOP add-sub | 4.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 21.18 |
Bytes prefetched | 0.00 |
Bytes loaded | 208.00 |
Bytes stored | 32.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 27.78 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 25.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 44.44 |
Vector-efficiency ratio all | 15.63 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 15.63 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.36 |
Path / |
Function | hypre_CSRMatrixMatvecOutOfPlace.extracted.19 |
Source file and lines | csr_matvec.c:181-206 |
Module | exec |
nb instructions | 67 |
nb uops | 67 |
loop length | 284 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 13 |
micro-operation queue | 11.33 cycles |
front end | 11.33 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.20 | 4.20 | 8.67 | 8.67 | 2.00 | 4.20 | 4.20 | 2.00 | 2.00 | 2.00 | 4.20 | 8.67 |
cycles | 4.20 | 4.20 | 8.67 | 8.67 | 2.00 | 4.20 | 4.20 | 2.00 | 2.00 | 2.00 | 4.20 | 8.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.24 |
Stall cycles | 1.09 |
LB full (events) | 1.61-1.68 |
LM full (events) | 0.42-0.35 |
Front-end | 11.33 |
Dispatch | 8.67 |
Overall L1 | 11.33 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 62% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 27% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 25% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 44% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 20% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 16% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 15% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 17% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VADDSD (%RAX,%RDI,8),%XMM1,%XMM1 | 1 | 0 | 0.50 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.50 |
VMOVSD %XMM1,(%RAX,%RDI,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x98(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%RCX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP -0x80(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
JE 4ce204 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xc4> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x88(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x90(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RAX,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMPQ $0x1,-0x48(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
JNE 4ce330 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX,%RDI,8),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R10,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4ce240 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x60(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RDX),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x58(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RDX),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R11,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 4ce494 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x354> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA -0x1(%R9),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%RCX,%R10,8),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%R8,%R10,8),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
CMP %R9,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 4ce240 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 4ce497 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x357> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
JL 4ce251 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x111> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX,%RDI,8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX,%RDI,8),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R9,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R8,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
IMUL -0x78(%RBP),%RDI | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RBX,-0xa8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
AND $-0x4,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA -0x1(%RBX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (,%R8,8),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4ce3ae <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x26e> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R10,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4ce240 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x100> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | hypre_CSRMatrixMatvecOutOfPlace.extracted.19 |
Source file and lines | csr_matvec.c:181-206 |
Module | exec |
nb instructions | 67 |
nb uops | 67 |
loop length | 284 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 13 |
micro-operation queue | 11.33 cycles |
front end | 11.33 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.20 | 4.20 | 8.67 | 8.67 | 2.00 | 4.20 | 4.20 | 2.00 | 2.00 | 2.00 | 4.20 | 8.67 |
cycles | 4.20 | 4.20 | 8.67 | 8.67 | 2.00 | 4.20 | 4.20 | 2.00 | 2.00 | 2.00 | 4.20 | 8.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.24 |
Stall cycles | 1.09 |
LB full (events) | 1.61-1.68 |
LM full (events) | 0.42-0.35 |
Front-end | 11.33 |
Dispatch | 8.67 |
Overall L1 | 11.33 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 62% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 27% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 25% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 44% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 20% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 16% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 15% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 17% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VADDSD (%RAX,%RDI,8),%XMM1,%XMM1 | 1 | 0 | 0.50 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.50 |
VMOVSD %XMM1,(%RAX,%RDI,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x98(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%RCX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP -0x80(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
JE 4ce204 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xc4> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x88(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %RCX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x90(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX,%RAX,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMPQ $0x1,-0x48(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
JNE 4ce330 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX,%RDI,8),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX,%RDI,8),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R10,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4ce240 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x60(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RDX),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x58(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RDX),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R11,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 4ce494 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x354> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA -0x1(%R9),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%RCX,%R10,8),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%R8,%R10,8),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
CMP %R9,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 4ce240 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 4ce497 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x357> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
JL 4ce251 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x111> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX,%RDI,8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX,%RDI,8),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %R9,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R8,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
IMUL -0x78(%RBP),%RDI | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RBX,-0xa8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
AND $-0x4,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA -0x1(%RBX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (,%R8,8),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4ce3ae <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x26e> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R10,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4ce240 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x100> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |