Loop Id: 3089 | Module: exec | Source: csr_matvec.c:307-314 | Coverage: 7.48% |
---|
Loop Id: 3089 | Module: exec | Source: csr_matvec.c:307-314 | Coverage: 7.48% |
---|
0x580880 MOV 0x30(%RSP),%R15 |
0x580885 MOV (%R8,%R12,8),%RDX |
0x580889 MOV 0x8(%R8,%R12,8),%RCX |
0x58088e VMOVSD (%R15,%R12,8),%XMM4 |
0x580894 CMP %RCX,%RDX |
0x580897 JGE 5810f0 |
0x58089d SUB %RDX,%RCX |
0x5808a0 MOV %RDX,%R15 |
0x5808a3 LEA -0x1(%RCX),%RSI |
0x5808a7 CMP $0x2,%RSI |
0x5808ab JBE 5816c1 |
0x5808b1 MOV %RCX,%R11 |
0x5808b4 LEA (,%RDX,8),%RSI |
0x5808bc VXORPD %XMM6,%XMM6,%XMM6 |
0x5808c0 XOR %EAX,%EAX |
0x5808c2 SHR $0x2,%R11 |
0x5808c6 LEA (%R14,%RSI,1),%R10 |
0x5808ca ADD %R13,%RSI |
0x5808cd SAL $0x5,%R11 |
0x5808d1 LEA -0x20(%R11),%RDI |
0x5808d5 SHR $0x5,%RDI |
0x5808d9 INC %RDI |
0x5808dc AND $0x7,%EDI |
0x5808df JE 5809cd |
0x5808e5 CMP $0x1,%RDI |
0x5808e9 JE 5809aa |
0x5808ef CMP $0x2,%RDI |
0x5808f3 JE 580990 |
0x5808f9 CMP $0x3,%RDI |
0x5808fd JE 580976 |
0x5808ff CMP $0x4,%RDI |
0x580903 JE 58095d |
0x580905 CMP $0x5,%RDI |
0x580909 JE 580943 |
0x58090b CMP $0x6,%RDI |
0x58090f JE 580929 |
0x580911 VMOVDQU (%RSI),%YMM9 |
0x580915 VMOVAPD %YMM12,%YMM1 |
0x580919 MOV $0x20,%EAX |
0x58091e VGATHERQPD %YMM1,(%RBX,%YMM9,8),%YMM8 |
0x580924 VFNMADD231PD (%R10),%YMM8,%YMM6 |
0x580929 VMOVDQU (%RSI,%RAX,1),%YMM11 |
0x58092e VMOVAPD %YMM12,%YMM14 |
0x580933 VGATHERQPD %YMM14,(%RBX,%YMM11,8),%YMM2 |
0x580939 VFNMADD231PD (%R10,%RAX,1),%YMM2,%YMM6 |
0x58093f ADD $0x20,%RAX |
0x580943 VMOVDQU (%RSI,%RAX,1),%YMM10 |
0x580948 VMOVAPD %YMM12,%YMM15 |
0x58094d VGATHERQPD %YMM15,(%RBX,%YMM10,8),%YMM0 |
0x580953 VFNMADD231PD (%R10,%RAX,1),%YMM0,%YMM6 |
0x580959 ADD $0x20,%RAX |
0x58095d VMOVDQU (%RSI,%RAX,1),%YMM7 |
0x580962 VMOVAPD %YMM12,%YMM3 |
0x580966 VGATHERQPD %YMM3,(%RBX,%YMM7,8),%YMM13 |
0x58096c VFNMADD231PD (%R10,%RAX,1),%YMM13,%YMM6 |
0x580972 ADD $0x20,%RAX |
0x580976 VMOVDQU (%RSI,%RAX,1),%YMM8 |
0x58097b VMOVAPD %YMM12,%YMM9 |
0x580980 VGATHERQPD %YMM9,(%RBX,%YMM8,8),%YMM1 |
0x580986 VFNMADD231PD (%R10,%RAX,1),%YMM1,%YMM6 |
0x58098c ADD $0x20,%RAX |
0x580990 VMOVDQU (%RSI,%RAX,1),%YMM11 |
0x580995 VMOVAPD %YMM12,%YMM14 |
0x58099a VGATHERQPD %YMM14,(%RBX,%YMM11,8),%YMM2 |
0x5809a0 VFNMADD231PD (%R10,%RAX,1),%YMM2,%YMM6 |
0x5809a6 ADD $0x20,%RAX |
0x5809aa VMOVDQU (%RSI,%RAX,1),%YMM10 |
0x5809af VMOVAPD %YMM12,%YMM15 |
0x5809b4 VGATHERQPD %YMM15,(%RBX,%YMM10,8),%YMM0 |
0x5809ba VFNMADD231PD (%R10,%RAX,1),%YMM0,%YMM6 |
0x5809c0 ADD $0x20,%RAX |
0x5809c4 CMP %R11,%RAX |
0x5809c7 JE 580aae |
(3090) 0x5809cd VMOVDQU (%RSI,%RAX,1),%YMM7 |
(3090) 0x5809d2 VMOVDQU 0x20(%RSI,%RAX,1),%YMM9 |
(3090) 0x5809d8 VMOVAPD %YMM12,%YMM3 |
(3090) 0x5809dc VMOVAPD %YMM12,%YMM1 |
(3090) 0x5809e0 VMOVDQU 0x40(%RSI,%RAX,1),%YMM14 |
(3090) 0x5809e6 VMOVDQU 0x60(%RSI,%RAX,1),%YMM2 |
(3090) 0x5809ec VMOVAPD %YMM12,%YMM15 |
(3090) 0x5809f1 VGATHERQPD %YMM3,(%RBX,%YMM7,8),%YMM13 |
(3090) 0x5809f7 VFNMADD132PD (%R10,%RAX,1),%YMM6,%YMM13 |
(3090) 0x5809fd VMOVAPD %YMM12,%YMM6 |
(3090) 0x580a01 VMOVDQU 0x80(%RSI,%RAX,1),%YMM7 |
(3090) 0x580a0a VGATHERQPD %YMM6,(%RBX,%YMM9,8),%YMM8 |
(3090) 0x580a10 VFNMADD132PD 0x20(%R10,%RAX,1),%YMM13,%YMM8 |
(3090) 0x580a17 VMOVAPD %YMM12,%YMM3 |
(3090) 0x580a1b VMOVDQU 0xa0(%RSI,%RAX,1),%YMM6 |
(3090) 0x580a24 VGATHERQPD %YMM1,(%RBX,%YMM14,8),%YMM11 |
(3090) 0x580a2a VFNMADD132PD 0x40(%R10,%RAX,1),%YMM8,%YMM11 |
(3090) 0x580a31 VMOVAPD %YMM12,%YMM13 |
(3090) 0x580a36 VMOVDQU 0xc0(%RSI,%RAX,1),%YMM1 |
(3090) 0x580a3f VGATHERQPD %YMM15,(%RBX,%YMM2,8),%YMM10 |
(3090) 0x580a45 VFNMADD132PD 0x60(%R10,%RAX,1),%YMM11,%YMM10 |
(3090) 0x580a4c VMOVAPD %YMM12,%YMM8 |
(3090) 0x580a51 VMOVDQU 0xe0(%RSI,%RAX,1),%YMM15 |
(3090) 0x580a5a VGATHERQPD %YMM3,(%RBX,%YMM7,8),%YMM0 |
(3090) 0x580a60 VMOVAPD %YMM12,%YMM11 |
(3090) 0x580a65 VGATHERQPD %YMM13,(%RBX,%YMM6,8),%YMM9 |
(3090) 0x580a6b VFNMADD132PD 0x80(%R10,%RAX,1),%YMM10,%YMM0 |
(3090) 0x580a75 VGATHERQPD %YMM8,(%RBX,%YMM1,8),%YMM14 |
(3090) 0x580a7b VGATHERQPD %YMM11,(%RBX,%YMM15,8),%YMM6 |
(3090) 0x580a81 VFNMADD132PD 0xa0(%R10,%RAX,1),%YMM0,%YMM9 |
(3090) 0x580a8b VFNMADD132PD 0xc0(%R10,%RAX,1),%YMM9,%YMM14 |
(3090) 0x580a95 VFNMADD132PD 0xe0(%R10,%RAX,1),%YMM14,%YMM6 |
(3090) 0x580a9f ADD $0x100,%RAX |
(3090) 0x580aa5 CMP %R11,%RAX |
(3090) 0x580aa8 JNE 5809cd |
0x580aae VEXTRACTF128 $0x1,%YMM6,%XMM10 |
0x580ab4 MOV %RCX,%R10 |
0x580ab7 VADDPD %XMM6,%XMM10,%XMM2 |
0x580abb AND $-0x4,%R10 |
0x580abf VADDPD %XMM6,%XMM10,%XMM13 |
0x580ac3 ADD %R10,%RDX |
0x580ac6 VUNPCKHPD %XMM2,%XMM2,%XMM3 |
0x580aca VADDPD %XMM2,%XMM3,%XMM7 |
0x580ace VADDSD %XMM7,%XMM4,%XMM14 |
0x580ad2 TEST $0x3,%CL |
0x580ad5 JE 580b24 |
0x580ad7 SUB %R10,%RCX |
0x580ada CMP $0x1,%RCX |
0x580ade JE 580b14 |
0x580ae0 ADD %R15,%R10 |
0x580ae3 VMOVAPD %XMM5,%XMM0 |
0x580ae7 VMOVDQU (%R13,%R10,8),%XMM9 |
0x580aee VGATHERQPD %XMM0,(%RBX,%XMM9,8),%XMM8 |
0x580af4 VFNMADD132PD (%R14,%R10,8),%XMM13,%XMM8 |
0x580afa VUNPCKHPD %XMM8,%XMM8,%XMM13 |
0x580aff VADDPD %XMM8,%XMM13,%XMM1 |
0x580b04 VADDSD %XMM4,%XMM1,%XMM14 |
0x580b08 TEST $0x1,%CL |
0x580b0b JE 580b24 |
0x580b0d AND $-0x2,%RCX |
0x580b11 ADD %RCX,%RDX |
0x580b14 MOV (%R13,%RDX,8),%RCX |
0x580b19 VMOVSD (%RBX,%RCX,8),%XMM4 |
0x580b1e VFNMADD231SD (%R14,%RDX,8),%XMM4,%XMM14 |
0x580b24 MOV 0x38(%RSP),%RDX |
0x580b29 VMOVSD %XMM14,(%RDX,%R12,8) |
0x580b2f INC %R12 |
0x580b32 CMP %R12,%R9 |
0x580b35 JNE 580880 |
0x5810f0 VMOVSD %XMM4,%XMM4,%XMM14 |
0x5810f4 JMP 580b24 |
0x5816c1 VMOVSD %XMM4,%XMM4,%XMM14 |
0x5816c5 VXORPD %XMM13,%XMM13,%XMM13 |
0x5816ca XOR %R10D,%R10D |
0x5816cd JMP 580ad7 |
/scratch_na/users/xoserete/qaas_runs/171-172-8217/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 307 - 314 |
-------------------------------------------------------------------------------- |
307: for (i = iBegin; i < iEnd; i++) |
308: { |
309: tempx = b_data[i]; |
310: for (jj = A_i[i]; jj < A_i[i+1]; jj++) |
311: { |
312: tempx -= A_data[jj] * x_data[A_j[jj]]; |
313: } |
314: y_data[i] = tempx; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.46 |
CQA speedup if FP arith vectorized | 1.54 |
CQA speedup if fully vectorized | 3.37 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.40 |
Bottlenecks | micro-operation queue |
Function | hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6 |
Source | csr_matvec.c:307-314 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 25.50 |
CQA cycles if no scalar integer | 17.50 |
CQA cycles if FP arith vectorized | 16.54 |
CQA cycles if fully vectorized | 7.56 |
Front-end cycles | 25.50 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 18.17 |
P1 cycles | 18.20 |
P2 cycles | 18.00 |
P3 cycles | 18.00 |
P4 cycles | 0.50 |
P5 cycles | 18.23 |
P6 cycles | 18.20 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 18.20 |
P11 cycles | 18.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 43.94 - 255.80 |
Stall cycles (UFS) | 17.76 - 229.62 |
Nb insns | 113.00 |
Nb uops | 145.00 |
Nb loads | 32.00 |
Nb stores | 1.00 |
Nb stack references | 2.00 |
FLOP/cycle | 2.82 |
Nb FLOP add-sub | 10.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 31.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 31.06 |
Bytes prefetched | 0.00 |
Bytes loaded | 784.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 62.90 |
Vectorization ratio load | 88.89 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 57.14 |
Vectorization ratio fma | 88.89 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 54.29 |
Vector-efficiency ratio all | 31.55 |
Vector-efficiency ratio load | 43.06 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 19.64 |
Vector-efficiency ratio fma | 43.06 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 29.11 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.46 |
CQA speedup if FP arith vectorized | 1.54 |
CQA speedup if fully vectorized | 3.37 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.40 |
Bottlenecks | micro-operation queue |
Function | hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6 |
Source | csr_matvec.c:307-314 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 25.50 |
CQA cycles if no scalar integer | 17.50 |
CQA cycles if FP arith vectorized | 16.54 |
CQA cycles if fully vectorized | 7.56 |
Front-end cycles | 25.50 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 18.17 |
P1 cycles | 18.20 |
P2 cycles | 18.00 |
P3 cycles | 18.00 |
P4 cycles | 0.50 |
P5 cycles | 18.23 |
P6 cycles | 18.20 |
P7 cycles | 0.50 |
P8 cycles | 0.50 |
P9 cycles | 0.50 |
P10 cycles | 18.20 |
P11 cycles | 18.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 43.94 - 255.80 |
Stall cycles (UFS) | 17.76 - 229.62 |
Nb insns | 113.00 |
Nb uops | 145.00 |
Nb loads | 32.00 |
Nb stores | 1.00 |
Nb stack references | 2.00 |
FLOP/cycle | 2.82 |
Nb FLOP add-sub | 10.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 31.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 31.06 |
Bytes prefetched | 0.00 |
Bytes loaded | 784.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 62.90 |
Vectorization ratio load | 88.89 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 57.14 |
Vectorization ratio fma | 88.89 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 54.29 |
Vector-efficiency ratio all | 31.55 |
Vector-efficiency ratio load | 43.06 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 19.64 |
Vector-efficiency ratio fma | 43.06 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 29.11 |
Path / |
Function | hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6 |
Source file and lines | csr_matvec.c:307-314 |
Module | exec |
nb instructions | 113 |
nb uops | 145 |
loop length | 500 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 13 |
used ymm registers | 14 |
used zmm registers | 0 |
nb stack references | 2 |
micro-operation queue | 25.50 cycles |
front end | 25.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 18.17 | 18.20 | 18.00 | 18.00 | 0.50 | 18.23 | 18.20 | 0.50 | 0.50 | 0.50 | 18.20 | 18.00 |
cycles | 18.17 | 18.20 | 18.00 | 18.00 | 0.50 | 18.23 | 18.20 | 0.50 | 0.50 | 0.50 | 18.20 | 18.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 43.94-255.80 |
Stall cycles | 17.76-229.62 |
ROB full (events) | 21.56-203.40 |
Front-end | 25.50 |
Dispatch | 18.23 |
Overall L1 | 25.50 |
all | 38% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 75% |
load | 84% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 66% |
fma | 88% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 82% |
all | 62% |
load | 88% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 57% |
fma | 88% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 54% |
all | 25% |
load | 46% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 34% |
load | 41% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 20% |
fma | 43% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 38% |
all | 31% |
load | 43% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 19% |
fma | 43% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 29% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x30(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R8,%R12,8),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%R8,%R12,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R15,%R12,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %RCX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 5810f0 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1f40> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
SUB %RDX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RDX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0x1(%RCX),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP $0x2,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JBE 5816c1 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x2511> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RCX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (,%RDX,8),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VXORPD %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SHR $0x2,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R14,%RSI,1),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R13,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x5,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA -0x20(%R11),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SHR $0x5,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
INC %RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x7,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 5809cd <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x181d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x1,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 5809aa <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17fa> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x2,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580990 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x3,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580976 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17c6> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x4,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 58095d <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17ad> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x5,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580943 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1793> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x6,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580929 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1779> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVDQU (%RSI),%YMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
MOV $0x20,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VGATHERQPD %YMM1,(%RBX,%YMM9,8),%YMM8 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10),%YMM8,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVDQU (%RSI,%RAX,1),%YMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM14,(%RBX,%YMM11,8),%YMM2 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM2,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM15,(%RBX,%YMM10,8),%YMM0 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM0,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM3,(%RBX,%YMM7,8),%YMM13 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM13,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM9,(%RBX,%YMM8,8),%YMM1 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM1,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM14,(%RBX,%YMM11,8),%YMM2 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM2,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM15,(%RBX,%YMM10,8),%YMM0 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM0,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R11,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580aae <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x18fe> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VEXTRACTF128 $0x1,%YMM6,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RCX,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VADDPD %XMM6,%XMM10,%XMM2 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
AND $-0x4,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
VADDPD %XMM6,%XMM10,%XMM13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
ADD %R10,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VUNPCKHPD %XMM2,%XMM2,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VADDPD %XMM2,%XMM3,%XMM7 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDSD %XMM7,%XMM4,%XMM14 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
TEST $0x3,%CL | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JE 580b24 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1974> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
SUB %R10,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x1,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580b14 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1964> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R15,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVAPD %XMM5,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVDQU (%R13,%R10,8),%XMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VGATHERQPD %XMM0,(%RBX,%XMM9,8),%XMM8 | 5 | 1.33 | 0.83 | 0.67 | 0.67 | 0 | 0.83 | 0 | 0 | 0 | 0 | 0 | 0.67 | 0-29 | 1.25 |
VFNMADD132PD (%R14,%R10,8),%XMM13,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VUNPCKHPD %XMM8,%XMM8,%XMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VADDPD %XMM8,%XMM13,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDSD %XMM4,%XMM1,%XMM14 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
TEST $0x1,%CL | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JE 580b24 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1974> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
AND $-0x2,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
ADD %RCX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV (%R13,%RDX,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RBX,%RCX,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VFNMADD231SD (%R14,%RDX,8),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV 0x38(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM14,(%RDX,%R12,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
INC %R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R12,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 580880 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x16d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVSD %XMM4,%XMM4,%XMM14 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JMP 580b24 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1974> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
VMOVSD %XMM4,%XMM4,%XMM14 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VXORPD %XMM13,%XMM13,%XMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %R10D,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 580ad7 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1927> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6 |
Source file and lines | csr_matvec.c:307-314 |
Module | exec |
nb instructions | 113 |
nb uops | 145 |
loop length | 500 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 13 |
used ymm registers | 14 |
used zmm registers | 0 |
nb stack references | 2 |
micro-operation queue | 25.50 cycles |
front end | 25.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 18.17 | 18.20 | 18.00 | 18.00 | 0.50 | 18.23 | 18.20 | 0.50 | 0.50 | 0.50 | 18.20 | 18.00 |
cycles | 18.17 | 18.20 | 18.00 | 18.00 | 0.50 | 18.23 | 18.20 | 0.50 | 0.50 | 0.50 | 18.20 | 18.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 43.94-255.80 |
Stall cycles | 17.76-229.62 |
ROB full (events) | 21.56-203.40 |
Front-end | 25.50 |
Dispatch | 18.23 |
Overall L1 | 25.50 |
all | 38% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 75% |
load | 84% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 66% |
fma | 88% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 82% |
all | 62% |
load | 88% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 57% |
fma | 88% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 54% |
all | 25% |
load | 46% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 34% |
load | 41% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 20% |
fma | 43% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 38% |
all | 31% |
load | 43% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 19% |
fma | 43% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 29% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x30(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R8,%R12,8),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%R8,%R12,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%R15,%R12,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP %RCX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 5810f0 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1f40> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
SUB %RDX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RDX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0x1(%RCX),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP $0x2,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JBE 5816c1 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x2511> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RCX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (,%RDX,8),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VXORPD %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SHR $0x2,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R14,%RSI,1),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R13,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x5,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA -0x20(%R11),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SHR $0x5,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
INC %RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x7,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 5809cd <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x181d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x1,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 5809aa <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17fa> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x2,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580990 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x3,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580976 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17c6> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x4,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 58095d <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x17ad> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x5,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580943 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1793> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x6,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580929 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1779> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVDQU (%RSI),%YMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
MOV $0x20,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VGATHERQPD %YMM1,(%RBX,%YMM9,8),%YMM8 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10),%YMM8,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVDQU (%RSI,%RAX,1),%YMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM14,(%RBX,%YMM11,8),%YMM2 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM2,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM15,(%RBX,%YMM10,8),%YMM0 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM0,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM3,(%RBX,%YMM7,8),%YMM13 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM13,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM9,(%RBX,%YMM8,8),%YMM1 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM1,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM14,(%RBX,%YMM11,8),%YMM2 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM2,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVDQU (%RSI,%RAX,1),%YMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMOVAPD %YMM12,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VGATHERQPD %YMM15,(%RBX,%YMM10,8),%YMM0 | 5 | 1.33 | 1.33 | 1.33 | 1.33 | 0 | 1.33 | 0 | 0 | 0 | 0 | 0 | 1.33 | 0-29 | 2 |
VFNMADD231PD (%R10,%RAX,1),%YMM0,%YMM6 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD $0x20,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R11,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580aae <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x18fe> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VEXTRACTF128 $0x1,%YMM6,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RCX,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VADDPD %XMM6,%XMM10,%XMM2 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
AND $-0x4,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
VADDPD %XMM6,%XMM10,%XMM13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
ADD %R10,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VUNPCKHPD %XMM2,%XMM2,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VADDPD %XMM2,%XMM3,%XMM7 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDSD %XMM7,%XMM4,%XMM14 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
TEST $0x3,%CL | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JE 580b24 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1974> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
SUB %R10,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x1,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 580b14 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1964> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R15,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVAPD %XMM5,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVDQU (%R13,%R10,8),%XMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VGATHERQPD %XMM0,(%RBX,%XMM9,8),%XMM8 | 5 | 1.33 | 0.83 | 0.67 | 0.67 | 0 | 0.83 | 0 | 0 | 0 | 0 | 0 | 0.67 | 0-29 | 1.25 |
VFNMADD132PD (%R14,%R10,8),%XMM13,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VUNPCKHPD %XMM8,%XMM8,%XMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VADDPD %XMM8,%XMM13,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VADDSD %XMM4,%XMM1,%XMM14 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
TEST $0x1,%CL | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JE 580b24 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1974> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
AND $-0x2,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
ADD %RCX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV (%R13,%RDX,8),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RBX,%RCX,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VFNMADD231SD (%R14,%RDX,8),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV 0x38(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM14,(%RDX,%R12,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
INC %R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R12,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 580880 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x16d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVSD %XMM4,%XMM4,%XMM14 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JMP 580b24 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1974> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
VMOVSD %XMM4,%XMM4,%XMM14 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VXORPD %XMM13,%XMM13,%XMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %R10D,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 580ad7 <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1927> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |