Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-204 [...] | Coverage: 0.02% |
---|
Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-204 [...] | Coverage: 0.02% |
---|
/scratch_na/users/xoserete/qaas_runs/171-172-8217/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 178 - 204 |
-------------------------------------------------------------------------------- |
178: #pragma omp parallel for private(i,j,jj,m,tempx) HYPRE_SMP_SCHEDULE |
179: #endif |
180: |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
0x4ce5d0 PUSH %RBP |
0x4ce5d1 MOV %RSP,%RBP |
0x4ce5d4 PUSH %R15 |
0x4ce5d6 PUSH %R14 |
0x4ce5d8 PUSH %R13 |
0x4ce5da PUSH %R12 |
0x4ce5dc PUSH %RBX |
0x4ce5dd SUB $0x98,%RSP |
0x4ce5e4 MOV 0x40(%RBP),%RAX |
0x4ce5e8 MOV %RAX,-0x68(%RBP) |
0x4ce5ec MOV 0x38(%RBP),%R12 |
0x4ce5f0 MOV 0x30(%RBP),%RAX |
0x4ce5f4 MOV %RAX,-0xb8(%RBP) |
0x4ce5fb MOV 0x28(%RBP),%RAX |
0x4ce5ff MOV %RAX,-0x78(%RBP) |
0x4ce603 MOV 0x20(%RBP),%RAX |
0x4ce607 MOV %RAX,-0x48(%RBP) |
0x4ce60b MOV 0x18(%RBP),%RAX |
0x4ce60f MOV %RAX,-0x40(%RBP) |
0x4ce613 MOV 0x10(%RBP),%RAX |
0x4ce617 MOV %RAX,-0x60(%RBP) |
0x4ce61b MOVL $0,-0x30(%RBP) |
0x4ce622 MOV %R9,-0x90(%RBP) |
0x4ce629 MOV %R8,-0x58(%RBP) |
0x4ce62d MOV %RCX,-0x38(%RBP) |
0x4ce631 MOV %RDX,-0x50(%RBP) |
0x4ce635 MOV (%RDI),%ESI |
0x4ce637 MOVQ $0,-0x70(%RBP) |
0x4ce63f MOVQ $0x1,-0xc0(%RBP) |
0x4ce64a SUB $0x8,%RSP |
0x4ce64e LEA -0xc0(%RBP),%RAX |
0x4ce655 LEA -0x30(%RBP),%RCX |
0x4ce659 LEA -0x70(%RBP),%R8 |
0x4ce65d LEA 0x50(%RBP),%R9 |
0x4ce661 MOV $0x719db0,%EDI |
0x4ce666 MOV %ESI,-0x2c(%RBP) |
0x4ce669 MOV $0x22,%EDX |
0x4ce66e PUSH $0x1 |
0x4ce670 PUSH $0x1 |
0x4ce672 PUSH %RAX |
0x4ce673 CALL 40fd90 <__kmpc_for_static_init_8@plt> |
0x4ce678 ADD $0x20,%RSP |
0x4ce67c MOV -0x70(%RBP),%RAX |
0x4ce680 MOV 0x50(%RBP),%RCX |
0x4ce684 MOV %RAX,-0x88(%RBP) |
0x4ce68b SUB %RAX,%RCX |
0x4ce68e MOV %RCX,-0x80(%RBP) |
0x4ce692 JAE 4ce6b5 |
0x4ce694 MOV $0x719dd0,%EDI |
0x4ce699 MOV -0x2c(%RBP),%ESI |
0x4ce69c ADD $0x98,%RSP |
0x4ce6a3 POP %RBX |
0x4ce6a4 POP %R12 |
0x4ce6a6 POP %R13 |
0x4ce6a8 POP %R14 |
0x4ce6aa POP %R15 |
0x4ce6ac POP %RBP |
0x4ce6ad VZEROUPPER |
0x4ce6b0 JMP 40fae0 |
0x4ce6b5 MOV -0x48(%RBP),%RAX |
0x4ce6b9 DEC %RAX |
0x4ce6bc MOV %RAX,-0xb0(%RBP) |
0x4ce6c3 VPBROADCASTQ %R12,%YMM0 |
0x4ce6c9 XOR %EDX,%EDX |
0x4ce6cb JMP 4ce6f2 |
0x4ce6cd NOPL (%RAX) |
(4069) 0x4ce6d0 MOV -0x40(%RBP),%RAX |
(4069) 0x4ce6d4 MOV (%RAX),%RAX |
(4069) 0x4ce6d7 VADDSD (%RAX,%RDI,8),%XMM1,%XMM1 |
(4069) 0x4ce6dc VMOVSD %XMM1,(%RAX,%RDI,8) |
(4069) 0x4ce6e1 MOV -0x98(%RBP),%RCX |
(4069) 0x4ce6e8 LEA 0x1(%RCX),%RDX |
(4069) 0x4ce6ec CMP -0x80(%RBP),%RCX |
(4069) 0x4ce6f0 JE 4ce694 |
(4069) 0x4ce6f2 MOV -0x88(%RBP),%RAX |
(4069) 0x4ce6f9 ADD %RDX,%RAX |
(4069) 0x4ce6fc MOV -0x90(%RBP),%RCX |
(4069) 0x4ce703 MOV (%RCX,%RAX,8),%RDI |
(4069) 0x4ce707 CMPQ $0x1,-0x48(%RBP) |
(4069) 0x4ce70c MOV %RDX,-0x98(%RBP) |
(4069) 0x4ce713 JNE 4ce7c0 |
(4069) 0x4ce719 MOV -0x38(%RBP),%RAX |
(4069) 0x4ce71d MOV (%RAX),%RAX |
(4069) 0x4ce720 MOV (%RAX,%RDI,8),%R10 |
(4069) 0x4ce724 MOV 0x8(%RAX,%RDI,8),%RAX |
(4069) 0x4ce729 VXORPD %XMM1,%XMM1,%XMM1 |
(4069) 0x4ce72d MOV %RAX,%R11 |
(4069) 0x4ce730 SUB %R10,%R11 |
(4069) 0x4ce733 JLE 4ce6d0 |
(4069) 0x4ce735 MOV -0x50(%RBP),%RCX |
(4069) 0x4ce739 MOV (%RCX),%RCX |
(4069) 0x4ce73c MOV -0x60(%RBP),%RDX |
(4069) 0x4ce740 MOV (%RDX),%RSI |
(4069) 0x4ce743 MOV -0x58(%RBP),%RDX |
(4069) 0x4ce747 MOV (%RDX),%R8 |
(4069) 0x4ce74a MOV %R11,%R9 |
(4069) 0x4ce74d AND $-0x4,%R9 |
(4069) 0x4ce751 JE 4ce924 |
(4069) 0x4ce757 LEA -0x1(%R9),%RDX |
(4069) 0x4ce75b LEA (%RCX,%R10,8),%RBX |
(4069) 0x4ce75f LEA (%R8,%R10,8),%R14 |
(4069) 0x4ce763 VXORPD %XMM1,%XMM1,%XMM1 |
(4069) 0x4ce767 XOR %R15D,%R15D |
(4069) 0x4ce76a NOPW (%RAX,%RAX,1) |
(4074) 0x4ce770 VMOVUPD (%R14,%R15,8),%YMM2 |
(4074) 0x4ce776 VXORPD %XMM3,%XMM3,%XMM3 |
(4074) 0x4ce77a KXNORW %K0,%K0,%K1 |
(4074) 0x4ce77e VGATHERQPD (%RSI,%YMM2,8),%YMM3{%K1} |
(4074) 0x4ce785 VFMADD231PD (%RBX,%R15,8),%YMM3,%YMM1 |
(4074) 0x4ce78b ADD $0x4,%R15 |
(4074) 0x4ce78f CMP %RDX,%R15 |
(4074) 0x4ce792 JBE 4ce770 |
(4069) 0x4ce794 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(4069) 0x4ce79a VADDPD %XMM2,%XMM1,%XMM1 |
(4069) 0x4ce79e VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(4069) 0x4ce7a3 VADDSD %XMM2,%XMM1,%XMM1 |
(4069) 0x4ce7a7 CMP %R9,%R11 |
(4069) 0x4ce7aa JE 4ce6d0 |
(4069) 0x4ce7b0 JMP 4ce927 |
0x4ce7b5 NOPW %CS:(%RAX,%RAX,1) |
(4069) 0x4ce7c0 JL 4ce6e1 |
(4069) 0x4ce7c6 MOV -0x38(%RBP),%RAX |
(4069) 0x4ce7ca MOV (%RAX),%RAX |
(4069) 0x4ce7cd MOV (%RAX,%RDI,8),%R8 |
(4069) 0x4ce7d1 MOV 0x8(%RAX,%RDI,8),%R9 |
(4069) 0x4ce7d6 MOV -0x40(%RBP),%RAX |
(4069) 0x4ce7da MOV (%RAX),%R10 |
(4069) 0x4ce7dd MOV %R9,%R14 |
(4069) 0x4ce7e0 SUB %R8,%R14 |
(4069) 0x4ce7e3 MOV %R14,-0xa8(%RBP) |
(4069) 0x4ce7ea AND $-0x4,%R14 |
(4069) 0x4ce7ee LEA -0x1(%R14),%RCX |
(4069) 0x4ce7f2 IMUL -0x78(%RBP),%RDI |
(4069) 0x4ce7f7 LEA (,%R8,8),%RAX |
(4069) 0x4ce7ff MOV %RAX,-0xa0(%RBP) |
(4069) 0x4ce806 XOR %EDX,%EDX |
(4069) 0x4ce808 JMP 4ce83e |
0x4ce80a NOPW (%RAX,%RAX,1) |
(4070) 0x4ce810 MOV %RDX,%RAX |
(4070) 0x4ce813 IMUL -0xb8(%RBP),%RAX |
(4070) 0x4ce81b ADD %RDI,%RAX |
(4070) 0x4ce81e VADDSD (%R10,%RAX,8),%XMM1,%XMM1 |
(4070) 0x4ce824 VMOVSD %XMM1,(%R10,%RAX,8) |
(4070) 0x4ce82a LEA 0x1(%RDX),%RAX |
(4070) 0x4ce82e CMP -0xb0(%RBP),%RDX |
(4070) 0x4ce835 MOV %RAX,%RDX |
(4070) 0x4ce838 JE 4ce6e1 |
(4070) 0x4ce83e VXORPD %XMM1,%XMM1,%XMM1 |
(4070) 0x4ce842 CMP %R8,%R9 |
(4070) 0x4ce845 JLE 4ce810 |
(4070) 0x4ce847 MOV -0x50(%RBP),%RAX |
(4070) 0x4ce84b MOV (%RAX),%RBX |
(4070) 0x4ce84e MOV -0x60(%RBP),%RAX |
(4070) 0x4ce852 MOV (%RAX),%RSI |
(4070) 0x4ce855 MOV -0x58(%RBP),%RAX |
(4070) 0x4ce859 MOV (%RAX),%RAX |
(4070) 0x4ce85c TEST %R14,%R14 |
(4070) 0x4ce85f JE 4ce8e3 |
(4070) 0x4ce865 MOV %RDX,%R11 |
(4070) 0x4ce868 IMUL -0x68(%RBP),%R11 |
(4070) 0x4ce86d VPBROADCASTQ %R11,%YMM2 |
(4070) 0x4ce873 MOV -0xa0(%RBP),%R15 |
(4070) 0x4ce87a LEA (%RBX,%R15,1),%R11 |
(4070) 0x4ce87e LEA (%RAX,%R15,1),%R13 |
(4070) 0x4ce882 VXORPD %XMM1,%XMM1,%XMM1 |
(4070) 0x4ce886 XOR %R15D,%R15D |
(4070) 0x4ce889 NOPL (%RAX) |
(4072) 0x4ce890 VXORPS %XMM3,%XMM3,%XMM3 |
(4072) 0x4ce894 VPMULLQ (%R13,%R15,8),%YMM0,%YMM3 |
(4072) 0x4ce89c VPADDQ %YMM2,%YMM3,%YMM3 |
(4072) 0x4ce8a0 KXNORW %K0,%K0,%K1 |
(4072) 0x4ce8a4 VXORPD %XMM4,%XMM4,%XMM4 |
(4072) 0x4ce8a8 VGATHERQPD (%RSI,%YMM3,8),%YMM4{%K1} |
(4072) 0x4ce8af VFMADD231PD (%R11,%R15,8),%YMM4,%YMM1 |
(4072) 0x4ce8b5 ADD $0x4,%R15 |
(4072) 0x4ce8b9 CMP %RCX,%R15 |
(4072) 0x4ce8bc JBE 4ce890 |
(4070) 0x4ce8be VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(4070) 0x4ce8c4 VADDPD %XMM2,%XMM1,%XMM1 |
(4070) 0x4ce8c8 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(4070) 0x4ce8cd VADDSD %XMM2,%XMM1,%XMM1 |
(4070) 0x4ce8d1 MOV %R14,%R11 |
(4070) 0x4ce8d4 CMP %R14,-0xa8(%RBP) |
(4070) 0x4ce8db JE 4ce810 |
(4070) 0x4ce8e1 JMP 4ce8e6 |
(4070) 0x4ce8e3 XOR %R11D,%R11D |
(4070) 0x4ce8e6 MOV %RDX,%R13 |
(4070) 0x4ce8e9 IMUL -0x68(%RBP),%R13 |
(4070) 0x4ce8ee ADD %R8,%R11 |
(4070) 0x4ce8f1 NOPW %CS:(%RAX,%RAX,1) |
(4071) 0x4ce900 MOV (%RAX,%R11,8),%R15 |
(4071) 0x4ce904 IMUL %R12,%R15 |
(4071) 0x4ce908 ADD %R13,%R15 |
(4071) 0x4ce90b VMOVSD (%RSI,%R15,8),%XMM2 |
(4071) 0x4ce911 VFMADD231SD (%RBX,%R11,8),%XMM2,%XMM1 |
(4071) 0x4ce917 INC %R11 |
(4071) 0x4ce91a CMP %R11,%R9 |
(4071) 0x4ce91d JNE 4ce900 |
(4070) 0x4ce91f JMP 4ce810 |
(4069) 0x4ce924 XOR %R9D,%R9D |
(4069) 0x4ce927 ADD %R10,%R9 |
(4069) 0x4ce92a NOPW (%RAX,%RAX,1) |
(4073) 0x4ce930 MOV (%R8,%R9,8),%RDX |
(4073) 0x4ce934 VMOVSD (%RSI,%RDX,8),%XMM2 |
(4073) 0x4ce939 VFMADD231SD (%RCX,%R9,8),%XMM2,%XMM1 |
(4073) 0x4ce93f INC %R9 |
(4073) 0x4ce942 CMP %R9,%RAX |
(4073) 0x4ce945 JNE 4ce930 |
(4069) 0x4ce947 JMP 4ce6d0 |
0x4ce94c NOPL (%RAX) |
Path / |
Source file and lines | csr_matvec.c:178-204 |
Module | exec |
nb instructions | 69 |
nb uops | 71 |
loop length | 277 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 25 |
micro-operation queue | 11.83 cycles |
front end | 11.83 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.70 | 1.60 | 6.00 | 6.00 | 13.50 | 1.60 | 1.50 | 13.50 | 13.50 | 13.50 | 1.60 | 6.00 |
cycles | 1.70 | 1.60 | 6.00 | 6.00 | 13.50 | 1.60 | 1.50 | 13.50 | 13.50 | 13.50 | 1.60 | 6.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.59 |
Stall cycles | 1.64-1.64 |
RS full (events) | 5.17-4.80 |
Front-end | 11.83 |
Dispatch | 13.50 |
Overall L1 | 13.50 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 9% |
store | 11% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x98,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0xb8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xc0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xc0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x70(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x719db0,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 40fd90 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4ce6b5 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xe5> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x719dd0,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x98,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 40fae0 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
DEC %RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RAX,-0xb0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4ce6f2 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x122> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | csr_matvec.c:178-204 |
Module | exec |
nb instructions | 69 |
nb uops | 71 |
loop length | 277 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 25 |
micro-operation queue | 11.83 cycles |
front end | 11.83 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.70 | 1.60 | 6.00 | 6.00 | 13.50 | 1.60 | 1.50 | 13.50 | 13.50 | 13.50 | 1.60 | 6.00 |
cycles | 1.70 | 1.60 | 6.00 | 6.00 | 13.50 | 1.60 | 1.50 | 13.50 | 13.50 | 13.50 | 1.60 | 6.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.59 |
Stall cycles | 1.64-1.64 |
RS full (events) | 5.17-4.80 |
Front-end | 11.83 |
Dispatch | 13.50 |
Overall L1 | 13.50 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 9% |
store | 11% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x98,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0xb8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xc0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xc0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x70(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x719db0,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 40fd90 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4ce6b5 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xe5> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x719dd0,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x98,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 40fae0 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
DEC %RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RAX,-0xb0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4ce6f2 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x122> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_CSRMatrixMatvecOutOfPlace.extracted.19– | 0.02 | 0 |
▼Loop 4069 - csr_matvec.c:178-204 - exec– | 0 | 0 |
○Loop 4074 - csr_matvec.c:194-195 - exec | 0.01 | 0 |
○Loop 4073 - csr_matvec.c:194-195 - exec | 0 | 0 |
▼Loop 4070 - csr_matvec.c:199-204 - exec– | 0 | 0 |
○Loop 4072 - csr_matvec.c:202-203 - exec | 0 | 0 |
○Loop 4071 - csr_matvec.c:202-203 - exec | 0 | 0 |