Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: libseq_mv.so | Source: csr_matvec.c:178-204 [...] | Coverage: 0.03% |
---|
Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: libseq_mv.so | Source: csr_matvec.c:178-204 [...] | Coverage: 0.03% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-147-2675/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 178 - 204 |
-------------------------------------------------------------------------------- |
178: #pragma omp parallel for private(i,j,jj,m,tempx) HYPRE_SMP_SCHEDULE |
179: #endif |
180: |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
0x7490 PUSH %RBP |
0x7491 MOV %RSP,%RBP |
0x7494 PUSH %R15 |
0x7496 PUSH %R14 |
0x7498 PUSH %R13 |
0x749a PUSH %R12 |
0x749c PUSH %RBX |
0x749d SUB $0xa8,%RSP |
0x74a4 MOV 0x40(%RBP),%RBX |
0x74a8 MOV 0x38(%RBP),%R12 |
0x74ac MOV 0x30(%RBP),%R14 |
0x74b0 MOV 0x28(%RBP),%RAX |
0x74b4 MOV %RAX,-0x80(%RBP) |
0x74b8 MOV 0x20(%RBP),%RAX |
0x74bc MOV %RAX,-0x48(%RBP) |
0x74c0 MOV 0x18(%RBP),%RAX |
0x74c4 MOV %RAX,-0x40(%RBP) |
0x74c8 MOV 0x10(%RBP),%RAX |
0x74cc MOV %RAX,-0x68(%RBP) |
0x74d0 MOVL $0,-0x30(%RBP) |
0x74d7 MOV %R9,-0x98(%RBP) |
0x74de MOV %R8,-0x60(%RBP) |
0x74e2 MOV %RCX,-0x38(%RBP) |
0x74e6 MOV %RDX,-0x58(%RBP) |
0x74ea MOV (%RDI),%ESI |
0x74ec MOVQ $0,-0x78(%RBP) |
0x74f4 MOVQ $0x1,-0xc8(%RBP) |
0x74ff SUB $0x8,%RSP |
0x7503 LEA -0xc8(%RBP),%RAX |
0x750a LEA 0x9f4f(%RIP),%RDI |
0x7511 LEA -0x30(%RBP),%RCX |
0x7515 LEA -0x78(%RBP),%R8 |
0x7519 LEA 0x50(%RBP),%R9 |
0x751d MOV %ESI,-0x2c(%RBP) |
0x7520 MOV $0x22,%EDX |
0x7525 PUSH $0x1 |
0x7527 PUSH $0x1 |
0x7529 PUSH %RAX |
0x752a CALL 30a0 <__kmpc_for_static_init_8@plt> |
0x752f ADD $0x20,%RSP |
0x7533 MOV -0x78(%RBP),%RAX |
0x7537 MOV 0x50(%RBP),%RCX |
0x753b MOV %RAX,-0x90(%RBP) |
0x7542 SUB %RAX,%RCX |
0x7545 MOV %RCX,-0x88(%RBP) |
0x754c JAE 7571 |
0x754e LEA 0x9f2b(%RIP),%RDI |
0x7555 MOV -0x2c(%RBP),%ESI |
0x7558 ADD $0xa8,%RSP |
0x755f POP %RBX |
0x7560 POP %R12 |
0x7562 POP %R13 |
0x7564 POP %R14 |
0x7566 POP %R15 |
0x7568 POP %RBP |
0x7569 VZEROUPPER |
0x756c JMP 3030 |
0x7571 XOR %EDX,%EDX |
0x7573 VPBROADCASTQ %R12,%YMM0 |
0x7579 MOV %R14,-0x50(%RBP) |
0x757d JMP 75a7 |
0x757f NOP |
(114) 0x7580 MOV -0x40(%RBP),%RAX |
(114) 0x7584 MOV (%RAX),%RAX |
(114) 0x7587 VADDSD (%RAX,%R11,8),%XMM1,%XMM1 |
(114) 0x758d VMOVSD %XMM1,(%RAX,%R11,8) |
(114) 0x7593 MOV -0xa0(%RBP),%RCX |
(114) 0x759a LEA 0x1(%RCX),%RDX |
(114) 0x759e CMP -0x88(%RBP),%RCX |
(114) 0x75a5 JE 754e |
(114) 0x75a7 MOV -0x90(%RBP),%RAX |
(114) 0x75ae ADD %RDX,%RAX |
(114) 0x75b1 MOV -0x98(%RBP),%RCX |
(114) 0x75b8 MOV (%RCX,%RAX,8),%R11 |
(114) 0x75bc CMPQ $0x1,-0x48(%RBP) |
(114) 0x75c1 MOV %RDX,-0xa0(%RBP) |
(114) 0x75c8 JNE 7680 |
(114) 0x75ce MOV -0x38(%RBP),%RAX |
(114) 0x75d2 MOV (%RAX),%RAX |
(114) 0x75d5 MOV (%RAX,%R11,8),%R9 |
(114) 0x75d9 MOV 0x8(%RAX,%R11,8),%RAX |
(114) 0x75de VXORPD %XMM1,%XMM1,%XMM1 |
(114) 0x75e2 MOV %RAX,%R10 |
(114) 0x75e5 SUB %R9,%R10 |
(114) 0x75e8 JLE 7580 |
(114) 0x75ea MOV -0x58(%RBP),%RCX |
(114) 0x75ee MOV (%RCX),%RCX |
(114) 0x75f1 MOV -0x68(%RBP),%RDX |
(114) 0x75f5 MOV (%RDX),%RDX |
(114) 0x75f8 MOV -0x60(%RBP),%RSI |
(114) 0x75fc MOV (%RSI),%RDI |
(114) 0x75ff MOV %R10,%R8 |
(114) 0x7602 AND $-0x4,%R8 |
(114) 0x7606 JE 77f5 |
(114) 0x760c MOV %R11,%R13 |
(114) 0x760f LEA -0x1(%R8),%RSI |
(114) 0x7613 LEA (%RCX,%R9,8),%R11 |
(114) 0x7617 LEA (%RDI,%R9,8),%R14 |
(114) 0x761b VXORPD %XMM1,%XMM1,%XMM1 |
(114) 0x761f XOR %R15D,%R15D |
(114) 0x7622 NOPW %CS:(%RAX,%RAX,1) |
(119) 0x7630 VMOVUPD (%R14,%R15,8),%YMM2 |
(119) 0x7636 VXORPD %XMM3,%XMM3,%XMM3 |
(119) 0x763a KXNORW %K0,%K0,%K1 |
(119) 0x763e VGATHERQPD (%RDX,%YMM2,8),%YMM3{%K1} |
(119) 0x7645 VFMADD231PD (%R11,%R15,8),%YMM3,%YMM1 |
(119) 0x764b ADD $0x4,%R15 |
(119) 0x764f CMP %RSI,%R15 |
(119) 0x7652 JBE 7630 |
(114) 0x7654 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(114) 0x765a VADDPD %XMM2,%XMM1,%XMM1 |
(114) 0x765e VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(114) 0x7663 VADDSD %XMM2,%XMM1,%XMM1 |
(114) 0x7667 CMP %R8,%R10 |
(114) 0x766a MOV -0x50(%RBP),%R14 |
(114) 0x766e MOV %R13,%R11 |
(114) 0x7671 JE 7580 |
(114) 0x7677 JMP 77f8 |
0x767c NOPL (%RAX) |
(114) 0x7680 JL 7593 |
(114) 0x7686 MOV -0x38(%RBP),%RAX |
(114) 0x768a MOV (%RAX),%RAX |
(114) 0x768d MOV (%RAX,%R11,8),%R10 |
(114) 0x7691 MOV 0x8(%RAX,%R11,8),%R8 |
(114) 0x7696 MOV -0x40(%RBP),%RAX |
(114) 0x769a MOV (%RAX),%R9 |
(114) 0x769d MOV -0x48(%RBP),%RAX |
(114) 0x76a1 DEC %RAX |
(114) 0x76a4 MOV %RAX,-0xc0(%RBP) |
(114) 0x76ab LEA (,%R10,8),%RAX |
(114) 0x76b3 MOV %RAX,-0xa8(%RBP) |
(114) 0x76ba MOV %R8,%RAX |
(114) 0x76bd SUB %R10,%RAX |
(114) 0x76c0 MOV %RAX,-0x70(%RBP) |
(114) 0x76c4 IMUL -0x80(%RBP),%R11 |
(114) 0x76c9 XOR %EDI,%EDI |
(114) 0x76cb MOV %R11,-0xb8(%RBP) |
(114) 0x76d2 MOV %R10,-0xb0(%RBP) |
(114) 0x76d9 JMP 770a |
0x76db NOPL (%RAX,%RAX,1) |
(115) 0x76e0 MOV %RDI,%RAX |
(115) 0x76e3 IMUL %R14,%RAX |
(115) 0x76e7 ADD %R11,%RAX |
(115) 0x76ea VADDSD (%R9,%RAX,8),%XMM1,%XMM1 |
(115) 0x76f0 VMOVSD %XMM1,(%R9,%RAX,8) |
(115) 0x76f6 LEA 0x1(%RDI),%RAX |
(115) 0x76fa CMP -0xc0(%RBP),%RDI |
(115) 0x7701 MOV %RAX,%RDI |
(115) 0x7704 JE 7593 |
(115) 0x770a VXORPD %XMM1,%XMM1,%XMM1 |
(115) 0x770e CMP %R10,%R8 |
(115) 0x7711 JLE 76e0 |
(115) 0x7713 MOV -0x58(%RBP),%RAX |
(115) 0x7717 MOV (%RAX),%R15 |
(115) 0x771a MOV -0x68(%RBP),%RAX |
(115) 0x771e MOV (%RAX),%R13 |
(115) 0x7721 MOV -0x60(%RBP),%RAX |
(115) 0x7725 MOV (%RAX),%RDX |
(115) 0x7728 MOV -0x70(%RBP),%RAX |
(115) 0x772c AND $-0x4,%RAX |
(115) 0x7730 JE 77bc |
(115) 0x7736 LEA -0x1(%RAX),%R11 |
(115) 0x773a MOV -0xa8(%RBP),%RCX |
(115) 0x7741 LEA (%R15,%RCX,1),%RSI |
(115) 0x7745 LEA (%RDX,%RCX,1),%R14 |
(115) 0x7749 VXORPD %XMM1,%XMM1,%XMM1 |
(115) 0x774d XOR %R10D,%R10D |
(117) 0x7750 VXORPS %XMM2,%XMM2,%XMM2 |
(117) 0x7754 VPMULLQ (%R14,%R10,8),%YMM0,%YMM2 |
(117) 0x775b MOV %RDI,%RCX |
(117) 0x775e IMUL %RBX,%RCX |
(117) 0x7762 VPBROADCASTQ %RCX,%YMM3 |
(117) 0x7768 VPADDQ %YMM3,%YMM2,%YMM2 |
(117) 0x776c KXNORW %K0,%K0,%K1 |
(117) 0x7770 VPXOR %XMM3,%XMM3,%XMM3 |
(117) 0x7774 VGATHERQPD (%R13,%YMM2,8),%YMM3{%K1} |
(117) 0x777c VFMADD231PD (%RSI,%R10,8),%YMM3,%YMM1 |
(117) 0x7782 ADD $0x4,%R10 |
(117) 0x7786 CMP %R11,%R10 |
(117) 0x7789 JBE 7750 |
(115) 0x778b VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(115) 0x7791 VADDPD %XMM2,%XMM1,%XMM1 |
(115) 0x7795 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(115) 0x779a VADDSD %XMM2,%XMM1,%XMM1 |
(115) 0x779e CMP %RAX,-0x70(%RBP) |
(115) 0x77a2 MOV -0x50(%RBP),%R14 |
(115) 0x77a6 MOV -0xb8(%RBP),%R11 |
(115) 0x77ad MOV -0xb0(%RBP),%R10 |
(115) 0x77b4 JE 76e0 |
(115) 0x77ba JMP 77be |
(115) 0x77bc XOR %EAX,%EAX |
(115) 0x77be ADD %R10,%RAX |
(115) 0x77c1 MOV %RDI,%RSI |
(115) 0x77c4 IMUL %RBX,%RSI |
(115) 0x77c8 NOPL (%RAX,%RAX,1) |
(116) 0x77d0 MOV (%RDX,%RAX,8),%RCX |
(116) 0x77d4 IMUL %R12,%RCX |
(116) 0x77d8 ADD %RSI,%RCX |
(116) 0x77db VMOVSD (%R13,%RCX,8),%XMM2 |
(116) 0x77e2 VFMADD231SD (%R15,%RAX,8),%XMM2,%XMM1 |
(116) 0x77e8 INC %RAX |
(116) 0x77eb CMP %RAX,%R8 |
(116) 0x77ee JNE 77d0 |
(115) 0x77f0 JMP 76e0 |
(114) 0x77f5 XOR %R8D,%R8D |
(114) 0x77f8 ADD %R9,%R8 |
(114) 0x77fb NOPL (%RAX,%RAX,1) |
(118) 0x7800 MOV (%RDI,%R8,8),%RSI |
(118) 0x7804 VMOVSD (%RDX,%RSI,8),%XMM2 |
(118) 0x7809 VFMADD231SD (%RCX,%R8,8),%XMM2,%XMM1 |
(118) 0x780f INC %R8 |
(118) 0x7812 CMP %R8,%RAX |
(118) 0x7815 JNE 7800 |
(114) 0x7817 JMP 7580 |
0x781c NOPL (%RAX) |
Path / |
Source file and lines | csr_matvec.c:178-204 |
Module | libseq_mv.so |
nb instructions | 65 |
nb uops | 67 |
loop length | 253 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 23 |
micro-operation queue | 11.17 cycles |
front end | 11.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.30 | 1.20 | 5.67 | 5.67 | 12.50 | 1.20 | 1.10 | 12.50 | 12.50 | 12.50 | 1.20 | 5.67 |
cycles | 1.30 | 1.20 | 5.67 | 5.67 | 12.50 | 1.20 | 1.10 | 12.50 | 12.50 | 12.50 | 1.20 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.59 |
Stall cycles | 1.29-1.29 |
RS full (events) | 4.00-3.92 |
Front-end | 11.17 |
Dispatch | 12.50 |
Overall L1 | 12.50 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x40(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xc8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xc8(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x9f4f(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x78(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 30a0 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 7571 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xe1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA 0x9f2b(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 3030 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 75a7 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x117> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | csr_matvec.c:178-204 |
Module | libseq_mv.so |
nb instructions | 65 |
nb uops | 67 |
loop length | 253 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 23 |
micro-operation queue | 11.17 cycles |
front end | 11.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.30 | 1.20 | 5.67 | 5.67 | 12.50 | 1.20 | 1.10 | 12.50 | 12.50 | 12.50 | 1.20 | 5.67 |
cycles | 1.30 | 1.20 | 5.67 | 5.67 | 12.50 | 1.20 | 1.10 | 12.50 | 12.50 | 12.50 | 1.20 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.59 |
Stall cycles | 1.29-1.29 |
RS full (events) | 4.00-3.92 |
Front-end | 11.17 |
Dispatch | 12.50 |
Overall L1 | 12.50 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x40(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xc8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xc8(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x9f4f(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x78(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 30a0 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 7571 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xe1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA 0x9f2b(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 3030 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 75a7 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x117> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_CSRMatrixMatvecOutOfPlace.extracted.19– | 0.03 | 0.01 |
▼Loop 114 - csr_matvec.c:178-204 - libseq_mv.so– | 0.01 | 0 |
○Loop 119 - csr_matvec.c:194-195 - libseq_mv.so | 0.02 | 0.01 |
○Loop 118 - csr_matvec.c:194-195 - libseq_mv.so | 0 | 0 |
▼Loop 115 - csr_matvec.c:178-204 - libseq_mv.so– | 0 | 0 |
○Loop 117 - csr_matvec.c:202-203 - libseq_mv.so | 0 | 0 |
○Loop 116 - csr_matvec.c:202-203 - libseq_mv.so | 0 | 0 |