Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-204 [...] | Coverage: 0.02% |
---|
Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-204 [...] | Coverage: 0.02% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-112-7443/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 178 - 204 |
-------------------------------------------------------------------------------- |
178: #pragma omp parallel for private(i,j,jj,m,tempx) HYPRE_SMP_SCHEDULE |
179: #endif |
180: |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
0x4df210 PUSH %RBP |
0x4df211 MOV %RSP,%RBP |
0x4df214 PUSH %R15 |
0x4df216 PUSH %R14 |
0x4df218 PUSH %R13 |
0x4df21a PUSH %R12 |
0x4df21c PUSH %RBX |
0x4df21d SUB $0x78,%RSP |
0x4df221 MOV 0x40(%RBP),%RAX |
0x4df225 MOV %RAX,-0x70(%RBP) |
0x4df229 MOV 0x38(%RBP),%R12 |
0x4df22d MOV 0x30(%RBP),%RAX |
0x4df231 MOV %RAX,-0x98(%RBP) |
0x4df238 MOV 0x28(%RBP),%RAX |
0x4df23c MOV %RAX,-0x80(%RBP) |
0x4df240 MOV 0x20(%RBP),%RBX |
0x4df244 MOV 0x18(%RBP),%RAX |
0x4df248 MOV %RAX,-0x68(%RBP) |
0x4df24c MOV 0x10(%RBP),%R14 |
0x4df250 MOVL $0,-0x30(%RBP) |
0x4df257 MOV %R9,-0x50(%RBP) |
0x4df25b MOV %R8,%R15 |
0x4df25e MOV %RCX,-0x48(%RBP) |
0x4df262 MOV %RDX,%R13 |
0x4df265 MOV (%RDI),%ESI |
0x4df267 MOVQ $0,-0x78(%RBP) |
0x4df26f MOVQ $0x1,-0xa0(%RBP) |
0x4df27a SUB $0x8,%RSP |
0x4df27e LEA -0xa0(%RBP),%RAX |
0x4df285 LEA -0x30(%RBP),%RCX |
0x4df289 LEA -0x78(%RBP),%R8 |
0x4df28d LEA 0x50(%RBP),%R9 |
0x4df291 MOV $0x537d70,%EDI |
0x4df296 MOV %ESI,-0x2c(%RBP) |
0x4df299 MOV $0x22,%EDX |
0x4df29e PUSH $0x1 |
0x4df2a0 PUSH $0x1 |
0x4df2a2 PUSH %RAX |
0x4df2a3 CALL 410420 <__kmpc_for_static_init_8@plt> |
0x4df2a8 ADD $0x20,%RSP |
0x4df2ac MOV -0x78(%RBP),%RAX |
0x4df2b0 MOV 0x50(%RBP),%RCX |
0x4df2b4 MOV %RAX,-0x40(%RBP) |
0x4df2b8 SUB %RAX,%RCX |
0x4df2bb MOV %RCX,-0x38(%RBP) |
0x4df2bf JAE 4df2df |
0x4df2c1 MOV $0x537d90,%EDI |
0x4df2c6 MOV -0x2c(%RBP),%ESI |
0x4df2c9 ADD $0x78,%RSP |
0x4df2cd POP %RBX |
0x4df2ce POP %R12 |
0x4df2d0 POP %R13 |
0x4df2d2 POP %R14 |
0x4df2d4 POP %R15 |
0x4df2d6 POP %RBP |
0x4df2d7 VZEROUPPER |
0x4df2da JMP 410170 |
0x4df2df CMP $0x1,%RBX |
0x4df2e3 JNE 4df3be |
0x4df2e9 XOR %EDX,%EDX |
0x4df2eb JMP 4df30b |
0x4df2ed NOPL (%RAX) |
(4325) 0x4df2f0 MOV -0x68(%RBP),%RCX |
(4325) 0x4df2f4 VADDSD (%RCX,%RAX,8),%XMM0,%XMM0 |
(4325) 0x4df2f9 VMOVSD %XMM0,(%RCX,%RAX,8) |
(4325) 0x4df2fe LEA 0x1(%RDX),%RAX |
(4325) 0x4df302 CMP -0x38(%RBP),%RDX |
(4325) 0x4df306 MOV %RAX,%RDX |
(4325) 0x4df309 JE 4df2c1 |
(4325) 0x4df30b MOV -0x40(%RBP),%RAX |
(4325) 0x4df30f ADD %RDX,%RAX |
(4325) 0x4df312 MOV -0x50(%RBP),%RCX |
(4325) 0x4df316 MOV (%RCX,%RAX,8),%RAX |
(4325) 0x4df31a MOV -0x48(%RBP),%RCX |
(4325) 0x4df31e MOV (%RCX,%RAX,8),%RDI |
(4325) 0x4df322 MOV 0x8(%RCX,%RAX,8),%RCX |
(4325) 0x4df327 VXORPD %XMM0,%XMM0,%XMM0 |
(4325) 0x4df32b MOV %RCX,%R8 |
(4325) 0x4df32e SUB %RDI,%R8 |
(4325) 0x4df331 JLE 4df2f0 |
(4325) 0x4df333 MOV %R8,%RSI |
(4325) 0x4df336 AND $-0x4,%RSI |
(4325) 0x4df33a JE 4df392 |
(4325) 0x4df33c LEA -0x1(%RSI),%R9 |
(4325) 0x4df340 LEA (%R13,%RDI,8),%R10 |
(4325) 0x4df345 LEA (%R15,%RDI,8),%R11 |
(4325) 0x4df349 VXORPD %XMM0,%XMM0,%XMM0 |
(4325) 0x4df34d XOR %R12D,%R12D |
(4327) 0x4df350 VMOVUPD (%R11,%R12,8),%YMM1 |
(4327) 0x4df356 VXORPD %XMM2,%XMM2,%XMM2 |
(4327) 0x4df35a KXNORW %K0,%K0,%K1 |
(4327) 0x4df35e VGATHERQPD (%R14,%YMM1,8),%YMM2{%K1} |
(4327) 0x4df365 VFMADD231PD (%R10,%R12,8),%YMM2,%YMM0 |
(4327) 0x4df36b ADD $0x4,%R12 |
(4327) 0x4df36f CMP %R9,%R12 |
(4327) 0x4df372 JBE 4df350 |
(4325) 0x4df374 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
(4325) 0x4df37a VADDPD %XMM1,%XMM0,%XMM0 |
(4325) 0x4df37e VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
(4325) 0x4df383 VADDSD %XMM1,%XMM0,%XMM0 |
(4325) 0x4df387 CMP %RSI,%R8 |
(4325) 0x4df38a JE 4df2f0 |
(4325) 0x4df390 JMP 4df394 |
(4325) 0x4df392 XOR %ESI,%ESI |
(4325) 0x4df394 ADD %RDI,%RSI |
(4325) 0x4df397 NOPW (%RAX,%RAX,1) |
(4326) 0x4df3a0 MOV (%R15,%RSI,8),%RDI |
(4326) 0x4df3a4 VMOVSD (%R14,%RDI,8),%XMM1 |
(4326) 0x4df3aa VFMADD231SD (%R13,%RSI,8),%XMM1,%XMM0 |
(4326) 0x4df3b1 INC %RSI |
(4326) 0x4df3b4 CMP %RSI,%RCX |
(4326) 0x4df3b7 JNE 4df3a0 |
(4325) 0x4df3b9 JMP 4df2f0 |
0x4df3be JL 4df2c1 |
0x4df3c4 DEC %RBX |
0x4df3c7 MOV %RBX,-0x90(%RBP) |
0x4df3ce VPBROADCASTQ %R12,%YMM0 |
0x4df3d4 XOR %ECX,%ECX |
0x4df3d6 JMP 4df3f8 |
0x4df3d8 NOPL (%RAX,%RAX,1) |
(4321) 0x4df3e0 MOV -0x88(%RBP),%RCX |
(4321) 0x4df3e7 LEA 0x1(%RCX),%RAX |
(4321) 0x4df3eb CMP -0x38(%RBP),%RCX |
(4321) 0x4df3ef MOV %RAX,%RCX |
(4321) 0x4df3f2 JE 4df2c1 |
(4321) 0x4df3f8 MOV -0x40(%RBP),%RAX |
(4321) 0x4df3fc MOV %RCX,-0x88(%RBP) |
(4321) 0x4df403 ADD %RCX,%RAX |
(4321) 0x4df406 MOV -0x50(%RBP),%RCX |
(4321) 0x4df40a MOV (%RCX,%RAX,8),%RSI |
(4321) 0x4df40e MOV -0x48(%RBP),%RAX |
(4321) 0x4df412 MOV (%RAX,%RSI,8),%RCX |
(4321) 0x4df416 MOV 0x8(%RAX,%RSI,8),%R8 |
(4321) 0x4df41b MOV %R8,%RAX |
(4321) 0x4df41e MOV %RCX,-0x60(%RBP) |
(4321) 0x4df422 SUB %RCX,%RAX |
(4321) 0x4df425 MOV %RAX,-0x58(%RBP) |
(4321) 0x4df429 JLE 4df3e0 |
(4321) 0x4df42b MOV -0x58(%RBP),%R9 |
(4321) 0x4df42f AND $-0x4,%R9 |
(4321) 0x4df433 LEA -0x1(%R9),%R11 |
(4321) 0x4df437 IMUL -0x80(%RBP),%RSI |
(4321) 0x4df43c MOV -0x60(%RBP),%RCX |
(4321) 0x4df440 LEA (%R13,%RCX,8),%RAX |
(4321) 0x4df445 LEA (%R15,%RCX,8),%RCX |
(4321) 0x4df449 XOR %EDI,%EDI |
(4321) 0x4df44b JMP 4df485 |
0x4df44d NOPL (%RAX) |
(4323) 0x4df450 MOV %RDI,%RDX |
(4323) 0x4df453 IMUL -0x98(%RBP),%RDX |
(4323) 0x4df45b ADD %RSI,%RDX |
(4323) 0x4df45e MOV -0x68(%RBP),%R10 |
(4323) 0x4df462 VADDSD (%R10,%RDX,8),%XMM1,%XMM1 |
(4323) 0x4df468 VMOVSD %XMM1,(%R10,%RDX,8) |
(4323) 0x4df46e LEA 0x1(%RDI),%RDX |
(4323) 0x4df472 MOV -0x90(%RBP),%R10 |
(4323) 0x4df479 CMP %R10,%RDI |
(4323) 0x4df47c MOV %RDX,%RDI |
(4323) 0x4df47f JE 4df3e0 |
(4323) 0x4df485 TEST %R9,%R9 |
(4323) 0x4df488 JE 4df4f0 |
(4323) 0x4df48a MOV %RDI,%RDX |
(4323) 0x4df48d IMUL -0x70(%RBP),%RDX |
(4323) 0x4df492 VPBROADCASTQ %RDX,%YMM2 |
(4323) 0x4df498 VXORPD %XMM1,%XMM1,%XMM1 |
(4323) 0x4df49c XOR %EDX,%EDX |
(4323) 0x4df49e XCHG %AX,%AX |
(4324) 0x4df4a0 VXORPS %XMM3,%XMM3,%XMM3 |
(4324) 0x4df4a4 VPMULLQ (%RCX,%RDX,8),%YMM0,%YMM3 |
(4324) 0x4df4ab VPADDQ %YMM2,%YMM3,%YMM3 |
(4324) 0x4df4af KXNORW %K0,%K0,%K1 |
(4324) 0x4df4b3 VXORPD %XMM4,%XMM4,%XMM4 |
(4324) 0x4df4b7 VGATHERQPD (%R14,%YMM3,8),%YMM4{%K1} |
(4324) 0x4df4be VFMADD231PD (%RAX,%RDX,8),%YMM4,%YMM1 |
(4324) 0x4df4c4 ADD $0x4,%RDX |
(4324) 0x4df4c8 CMP %R11,%RDX |
(4324) 0x4df4cb JBE 4df4a0 |
(4323) 0x4df4cd VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(4323) 0x4df4d3 VADDPD %XMM2,%XMM1,%XMM1 |
(4323) 0x4df4d7 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(4323) 0x4df4dc VADDSD %XMM2,%XMM1,%XMM1 |
(4323) 0x4df4e0 MOV %R9,%R10 |
(4323) 0x4df4e3 CMP %R9,-0x58(%RBP) |
(4323) 0x4df4e7 JE 4df450 |
(4323) 0x4df4ed JMP 4df4f7 |
0x4df4ef NOP |
(4323) 0x4df4f0 VXORPD %XMM1,%XMM1,%XMM1 |
(4323) 0x4df4f4 XOR %R10D,%R10D |
(4323) 0x4df4f7 MOV %RDI,%RDX |
(4323) 0x4df4fa IMUL -0x70(%RBP),%RDX |
(4323) 0x4df4ff ADD -0x60(%RBP),%R10 |
(4323) 0x4df503 NOPW %CS:(%RAX,%RAX,1) |
(4322) 0x4df510 MOV (%R15,%R10,8),%RBX |
(4322) 0x4df514 IMUL %R12,%RBX |
(4322) 0x4df518 ADD %RDX,%RBX |
(4322) 0x4df51b VMOVSD (%R14,%RBX,8),%XMM2 |
(4322) 0x4df521 VFMADD231SD (%R13,%R10,8),%XMM2,%XMM1 |
(4322) 0x4df528 INC %R10 |
(4322) 0x4df52b CMP %R10,%R8 |
(4322) 0x4df52e JNE 4df510 |
(4323) 0x4df530 JMP 4df450 |
0x4df535 NOPW %CS:(%RAX,%RAX,1) |
Path / |
Source file and lines | csr_matvec.c:178-204 |
Module | exec |
nb instructions | 72 |
nb uops | 74 |
loop length | 273 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 21 |
micro-operation queue | 12.33 cycles |
front end | 12.33 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
cycles | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.11 |
Stall cycles | 0.00 |
Front-end | 12.33 |
Dispatch | 11.50 |
Overall L1 | 12.33 |
Vectorization ratios | |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Vector efficiency ratios | |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xa0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x78(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x537d70,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 410420 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4df2df <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xcf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x537d90,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 410170 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
CMP $0x1,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 4df3be <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1ae> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4df30b <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xfb> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JL 4df2c1 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xb1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
DEC %RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RBX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4df3f8 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1e8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | csr_matvec.c:178-204 |
Module | exec |
nb instructions | 72 |
nb uops | 74 |
loop length | 273 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 21 |
micro-operation queue | 12.33 cycles |
front end | 12.33 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
cycles | 1.90 | 1.80 | 5.67 | 5.67 | 11.50 | 1.80 | 1.70 | 11.50 | 11.50 | 11.50 | 1.80 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.11 |
Stall cycles | 0.00 |
Front-end | 12.33 |
Dispatch | 11.50 |
Overall L1 | 12.33 |
Vectorization ratios | |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Vector efficiency ratios | |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xa0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x78(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x537d70,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 410420 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4df2df <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xcf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x537d90,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 410170 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
CMP $0x1,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 4df3be <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1ae> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4df30b <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xfb> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JL 4df2c1 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xb1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
DEC %RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RBX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4df3f8 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1e8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_CSRMatrixMatvecOutOfPlace.extracted.19– | 0.02 | 0 |
▼Loop 4321 - csr_matvec.c:178-204 - exec– | 0 | 0 |
▼Loop 4323 - csr_matvec.c:199-204 - exec– | 0 | 0 |
○Loop 4322 - csr_matvec.c:202-203 - exec | 0 | 0 |
○Loop 4324 - csr_matvec.c:202-203 - exec | 0 | 0 |
▼Loop 4325 - csr_matvec.c:178-196 - exec– | 0 | 0 |
○Loop 4327 - csr_matvec.c:194-195 - exec | 0.02 | 0 |
○Loop 4326 - csr_matvec.c:194-195 - exec | 0 | 0 |