Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-204 [...] | Coverage: 0.03% |
---|
Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-204 [...] | Coverage: 0.03% |
---|
/scratch_na/users/xoserete/qaas_runs/171-172-8218/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 178 - 204 |
-------------------------------------------------------------------------------- |
178: #pragma omp parallel for private(i,j,jj,m,tempx) HYPRE_SMP_SCHEDULE |
179: #endif |
180: |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
0x4decd0 PUSH %RBP |
0x4decd1 MOV %RSP,%RBP |
0x4decd4 PUSH %R15 |
0x4decd6 PUSH %R14 |
0x4decd8 PUSH %R13 |
0x4decda PUSH %R12 |
0x4decdc PUSH %RBX |
0x4decdd SUB $0x78,%RSP |
0x4dece1 MOV 0x40(%RBP),%RAX |
0x4dece5 MOV %RAX,-0x70(%RBP) |
0x4dece9 MOV 0x38(%RBP),%R12 |
0x4deced MOV 0x30(%RBP),%RAX |
0x4decf1 MOV %RAX,-0x98(%RBP) |
0x4decf8 MOV 0x28(%RBP),%RAX |
0x4decfc MOV %RAX,-0x80(%RBP) |
0x4ded00 MOV 0x20(%RBP),%RBX |
0x4ded04 MOV 0x18(%RBP),%RAX |
0x4ded08 MOV %RAX,-0x68(%RBP) |
0x4ded0c MOV 0x10(%RBP),%R14 |
0x4ded10 MOVL $0,-0x30(%RBP) |
0x4ded17 MOV %R9,-0x50(%RBP) |
0x4ded1b MOV %R8,%R15 |
0x4ded1e MOV %RCX,-0x48(%RBP) |
0x4ded22 MOV %RDX,%R13 |
0x4ded25 MOV (%RDI),%ESI |
0x4ded27 MOVQ $0,-0x78(%RBP) |
0x4ded2f MOVQ $0x1,-0xa0(%RBP) |
0x4ded3a SUB $0x8,%RSP |
0x4ded3e LEA -0xa0(%RBP),%RAX |
0x4ded45 LEA -0x30(%RBP),%RCX |
0x4ded49 LEA -0x78(%RBP),%R8 |
0x4ded4d LEA 0x50(%RBP),%R9 |
0x4ded51 MOV $0x736d70,%EDI |
0x4ded56 MOV %ESI,-0x2c(%RBP) |
0x4ded59 MOV $0x22,%EDX |
0x4ded5e PUSH $0x1 |
0x4ded60 PUSH $0x1 |
0x4ded62 PUSH %RAX |
0x4ded63 CALL 40fee0 <__kmpc_for_static_init_8@plt> |
0x4ded68 ADD $0x20,%RSP |
0x4ded6c MOV -0x78(%RBP),%RAX |
0x4ded70 MOV 0x50(%RBP),%RCX |
0x4ded74 MOV %RAX,-0x40(%RBP) |
0x4ded78 SUB %RAX,%RCX |
0x4ded7b MOV %RCX,-0x38(%RBP) |
0x4ded7f JAE 4ded9f |
0x4ded81 MOV $0x736d90,%EDI |
0x4ded86 MOV -0x2c(%RBP),%ESI |
0x4ded89 ADD $0x78,%RSP |
0x4ded8d POP %RBX |
0x4ded8e POP %R12 |
0x4ded90 POP %R13 |
0x4ded92 POP %R14 |
0x4ded94 POP %R15 |
0x4ded96 POP %RBP |
0x4ded97 VZEROUPPER |
0x4ded9a JMP 40fc30 |
0x4ded9f CMP $0x1,%RBX |
0x4deda3 JNE 4dee7e |
0x4deda9 XOR %EDX,%EDX |
0x4dedab JMP 4dedcb |
0x4dedad NOPL (%RAX) |
(4325) 0x4dedb0 MOV -0x68(%RBP),%RCX |
(4325) 0x4dedb4 VADDSD (%RCX,%RAX,8),%XMM0,%XMM0 |
(4325) 0x4dedb9 VMOVSD %XMM0,(%RCX,%RAX,8) |
(4325) 0x4dedbe LEA 0x1(%RDX),%RAX |
(4325) 0x4dedc2 CMP -0x38(%RBP),%RDX |
(4325) 0x4dedc6 MOV %RAX,%RDX |
(4325) 0x4dedc9 JE 4ded81 |
(4325) 0x4dedcb MOV -0x40(%RBP),%RAX |
(4325) 0x4dedcf ADD %RDX,%RAX |
(4325) 0x4dedd2 MOV -0x50(%RBP),%RCX |
(4325) 0x4dedd6 MOV (%RCX,%RAX,8),%RAX |
(4325) 0x4dedda MOV -0x48(%RBP),%RCX |
(4325) 0x4dedde MOV (%RCX,%RAX,8),%RDI |
(4325) 0x4dede2 MOV 0x8(%RCX,%RAX,8),%RCX |
(4325) 0x4dede7 VXORPD %XMM0,%XMM0,%XMM0 |
(4325) 0x4dedeb MOV %RCX,%R8 |
(4325) 0x4dedee SUB %RDI,%R8 |
(4325) 0x4dedf1 JLE 4dedb0 |
(4325) 0x4dedf3 MOV %R8,%RSI |
(4325) 0x4dedf6 AND $-0x4,%RSI |
(4325) 0x4dedfa JE 4dee52 |
(4325) 0x4dedfc LEA -0x1(%RSI),%R9 |
(4325) 0x4dee00 LEA (%R13,%RDI,8),%R10 |
(4325) 0x4dee05 LEA (%R15,%RDI,8),%R11 |
(4325) 0x4dee09 VXORPD %XMM0,%XMM0,%XMM0 |
(4325) 0x4dee0d XOR %R12D,%R12D |
(4327) 0x4dee10 VMOVUPD (%R11,%R12,8),%YMM1 |
(4327) 0x4dee16 VXORPD %XMM2,%XMM2,%XMM2 |
(4327) 0x4dee1a KXNORW %K0,%K0,%K1 |
(4327) 0x4dee1e VGATHERQPD (%R14,%YMM1,8),%YMM2{%K1} |
(4327) 0x4dee25 VFMADD231PD (%R10,%R12,8),%YMM2,%YMM0 |
(4327) 0x4dee2b ADD $0x4,%R12 |
(4327) 0x4dee2f CMP %R9,%R12 |
(4327) 0x4dee32 JBE 4dee10 |
(4325) 0x4dee34 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
(4325) 0x4dee3a VADDPD %XMM1,%XMM0,%XMM0 |
(4325) 0x4dee3e VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
(4325) 0x4dee43 VADDSD %XMM1,%XMM0,%XMM0 |
(4325) 0x4dee47 CMP %RSI,%R8 |
(4325) 0x4dee4a JE 4dedb0 |
(4325) 0x4dee50 JMP 4dee54 |
(4325) 0x4dee52 XOR %ESI,%ESI |
(4325) 0x4dee54 ADD %RDI,%RSI |
(4325) 0x4dee57 NOPW (%RAX,%RAX,1) |
(4326) 0x4dee60 MOV (%R15,%RSI,8),%RDI |
(4326) 0x4dee64 VMOVSD (%R14,%RDI,8),%XMM1 |
(4326) 0x4dee6a VFMADD231SD (%R13,%RSI,8),%XMM1,%XMM0 |
(4326) 0x4dee71 INC %RSI |
(4326) 0x4dee74 CMP %RSI,%RCX |
(4326) 0x4dee77 JNE 4dee60 |
(4325) 0x4dee79 JMP 4dedb0 |
0x4dee7e JL 4ded81 |
0x4dee84 DEC %RBX |
0x4dee87 MOV %RBX,-0x90(%RBP) |
0x4dee8e VPBROADCASTQ %R12,%YMM0 |
0x4dee94 XOR %ECX,%ECX |
0x4dee96 JMP 4deeb8 |
0x4dee98 NOPL (%RAX,%RAX,1) |
(4321) 0x4deea0 MOV -0x88(%RBP),%RCX |
(4321) 0x4deea7 LEA 0x1(%RCX),%RAX |
(4321) 0x4deeab CMP -0x38(%RBP),%RCX |
(4321) 0x4deeaf MOV %RAX,%RCX |
(4321) 0x4deeb2 JE 4ded81 |
(4321) 0x4deeb8 MOV -0x40(%RBP),%RAX |
(4321) 0x4deebc MOV %RCX,-0x88(%RBP) |
(4321) 0x4deec3 ADD %RCX,%RAX |
(4321) 0x4deec6 MOV -0x50(%RBP),%RCX |
(4321) 0x4deeca MOV (%RCX,%RAX,8),%RSI |
(4321) 0x4deece MOV -0x48(%RBP),%RAX |
(4321) 0x4deed2 MOV (%RAX,%RSI,8),%RCX |
(4321) 0x4deed6 MOV 0x8(%RAX,%RSI,8),%R8 |
(4321) 0x4deedb MOV %R8,%RAX |
(4321) 0x4deede MOV %RCX,-0x60(%RBP) |
(4321) 0x4deee2 SUB %RCX,%RAX |
(4321) 0x4deee5 MOV %RAX,-0x58(%RBP) |
(4321) 0x4deee9 JLE 4deea0 |
(4321) 0x4deeeb MOV -0x58(%RBP),%R9 |
(4321) 0x4deeef AND $-0x4,%R9 |
(4321) 0x4deef3 LEA -0x1(%R9),%R11 |
(4321) 0x4deef7 IMUL -0x80(%RBP),%RSI |
(4321) 0x4deefc MOV -0x60(%RBP),%RCX |
(4321) 0x4def00 LEA (%R13,%RCX,8),%RAX |
(4321) 0x4def05 LEA (%R15,%RCX,8),%RCX |
(4321) 0x4def09 XOR %EDI,%EDI |
(4321) 0x4def0b JMP 4def45 |
0x4def0d NOPL (%RAX) |
(4323) 0x4def10 MOV %RDI,%RDX |
(4323) 0x4def13 IMUL -0x98(%RBP),%RDX |
(4323) 0x4def1b ADD %RSI,%RDX |
(4323) 0x4def1e MOV -0x68(%RBP),%R10 |
(4323) 0x4def22 VADDSD (%R10,%RDX,8),%XMM1,%XMM1 |
(4323) 0x4def28 VMOVSD %XMM1,(%R10,%RDX,8) |
(4323) 0x4def2e LEA 0x1(%RDI),%RDX |
(4323) 0x4def32 MOV -0x90(%RBP),%R10 |
(4323) 0x4def39 CMP %R10,%RDI |
(4323) 0x4def3c MOV %RDX,%RDI |
(4323) 0x4def3f JE 4deea0 |
(4323) 0x4def45 TEST %R9,%R9 |
(4323) 0x4def48 JE 4defb0 |
(4323) 0x4def4a MOV %RDI,%RDX |
(4323) 0x4def4d IMUL -0x70(%RBP),%RDX |
(4323) 0x4def52 VPBROADCASTQ %RDX,%YMM2 |
(4323) 0x4def58 VXORPD %XMM1,%XMM1,%XMM1 |
(4323) 0x4def5c XOR %EDX,%EDX |
(4323) 0x4def5e XCHG %AX,%AX |
(4324) 0x4def60 VXORPS %XMM3,%XMM3,%XMM3 |
(4324) 0x4def64 VPMULLQ (%RCX,%RDX,8),%YMM0,%YMM3 |
(4324) 0x4def6b VPADDQ %YMM2,%YMM3,%YMM3 |
(4324) 0x4def6f KXNORW %K0,%K0,%K1 |
(4324) 0x4def73 VXORPD %XMM4,%XMM4,%XMM4 |
(4324) 0x4def77 VGATHERQPD (%R14,%YMM3,8),%YMM4{%K1} |
(4324) 0x4def7e VFMADD231PD (%RAX,%RDX,8),%YMM4,%YMM1 |
(4324) 0x4def84 ADD $0x4,%RDX |
(4324) 0x4def88 CMP %R11,%RDX |
(4324) 0x4def8b JBE 4def60 |
(4323) 0x4def8d VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(4323) 0x4def93 VADDPD %XMM2,%XMM1,%XMM1 |
(4323) 0x4def97 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(4323) 0x4def9c VADDSD %XMM2,%XMM1,%XMM1 |
(4323) 0x4defa0 MOV %R9,%R10 |
(4323) 0x4defa3 CMP %R9,-0x58(%RBP) |
(4323) 0x4defa7 JE 4def10 |
(4323) 0x4defad JMP 4defb7 |
0x4defaf NOP |
(4323) 0x4defb0 VXORPD %XMM1,%XMM1,%XMM1 |
(4323) 0x4defb4 XOR %R10D,%R10D |
(4323) 0x4defb7 MOV %RDI,%RDX |
(4323) 0x4defba IMUL -0x70(%RBP),%RDX |
(4323) 0x4defbf ADD -0x60(%RBP),%R10 |
(4323) 0x4defc3 NOPW %CS:(%RAX,%RAX,1) |
(4322) 0x4defd0 MOV (%R15,%R10,8),%RBX |
(4322) 0x4defd4 IMUL %R12,%RBX |
(4322) 0x4defd8 ADD %RDX,%RBX |
(4322) 0x4defdb VMOVSD (%R14,%RBX,8),%XMM2 |
(4322) 0x4defe1 VFMADD231SD (%R13,%R10,8),%XMM2,%XMM1 |
(4322) 0x4defe8 INC %R10 |
(4322) 0x4defeb CMP %R10,%R8 |
(4322) 0x4defee JNE 4defd0 |
(4323) 0x4deff0 JMP 4def10 |
0x4deff5 NOPW %CS:(%RAX,%RAX,1) |
Path / |
Source file and lines | csr_matvec.c:178-204 |
Module | exec |
nb instructions | 72 |
nb uops | 74 |
loop length | 273 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 21 |
micro-operation queue | 12.33 cycles |
front end | 12.33 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
cycles | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.11 |
Stall cycles | 0.00 |
Front-end | 12.33 |
Dispatch | 11.50 |
Overall L1 | 12.33 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xa0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x78(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x736d70,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 40fee0 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4ded9f <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xcf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x736d90,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 40fc30 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
CMP $0x1,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 4dee7e <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1ae> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4dedcb <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xfb> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JL 4ded81 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xb1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
DEC %RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RBX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4deeb8 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1e8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | csr_matvec.c:178-204 |
Module | exec |
nb instructions | 72 |
nb uops | 74 |
loop length | 273 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 21 |
micro-operation queue | 12.33 cycles |
front end | 12.33 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
cycles | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.11 |
Stall cycles | 0.00 |
Front-end | 12.33 |
Dispatch | 11.50 |
Overall L1 | 12.33 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xa0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x78(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x736d70,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 40fee0 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4ded9f <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xcf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x736d90,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 40fc30 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
CMP $0x1,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 4dee7e <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1ae> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4dedcb <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xfb> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JL 4ded81 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xb1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
DEC %RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RBX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4deeb8 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1e8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_CSRMatrixMatvecOutOfPlace.extracted.19– | 0.03 | 0.01 |
▼Loop 4321 - csr_matvec.c:178-204 - exec– | 0 | 0 |
▼Loop 4323 - csr_matvec.c:199-204 - exec– | 0 | 0 |
○Loop 4322 - csr_matvec.c:202-203 - exec | 0 | 0 |
○Loop 4324 - csr_matvec.c:202-203 - exec | 0 | 0 |
▼Loop 4325 - csr_matvec.c:178-196 - exec– | 0 | 0 |
○Loop 4327 - csr_matvec.c:194-195 - exec | 0.02 | 0.01 |
○Loop 4326 - csr_matvec.c:194-195 - exec | 0 | 0 |