Loop Id: 2663 | Module: exec | Source: ams.c:3662-3682 | Coverage: 3.51% |
---|
Loop Id: 2663 | Module: exec | Source: ams.c:3662-3682 | Coverage: 3.51% |
---|
0x4ba47f MOV -0x38(%RBP),%RCX |
0x4ba483 MOV -0x48(%RBP),%RSI |
0x4ba487 MOV -0x30(%RBP),%R9 |
0x4ba48b NOPL (%RAX,%RAX,1) |
0x4ba490 VMULSD %XMM0,%XMM2,%XMM2 |
0x4ba494 MOV -0x78(%RBP),%RAX |
0x4ba498 VDIVSD (%RAX,%R9,8),%XMM2,%XMM2 |
0x4ba49e MOV -0x68(%RBP),%RAX |
0x4ba4a2 VADDSD (%RAX,%R9,8),%XMM2,%XMM2 |
0x4ba4a8 VMOVSD %XMM2,(%RAX,%R9,8) |
(2662) 0x4ba4ae CMP -0x80(%RBP),%RDX |
(2662) 0x4ba4b2 LEA 0x1(%RDX),%RDX |
(2662) 0x4ba4b6 JE 4ba453 |
(2662) 0x4ba4b8 LEA (%RSI,%RDX,1),%R9 |
(2662) 0x4ba4bc MOV (%RCX,%R9,8),%RDI |
(2662) 0x4ba4c0 VUCOMISD (%R14,%RDI,8),%XMM1 |
(2662) 0x4ba4c6 JE 4ba4ae |
0x4ba4c8 MOV -0x70(%RBP),%RAX |
0x4ba4cc VMOVSD (%RAX,%R9,8),%XMM2 |
0x4ba4d2 MOV 0x8(%RCX,%R9,8),%R8 |
0x4ba4d7 SUB %RDI,%R8 |
0x4ba4da MOV %R9,-0x30(%RBP) |
0x4ba4de JLE 4ba6b2 |
0x4ba4e4 CMP $0x8,%R8 |
0x4ba4e8 JB 4ba5af |
0x4ba4ee MOV %R8,%R9 |
0x4ba4f1 SHR $0x3,%R9 |
0x4ba4f5 LEA 0x38(,%RDI,8),%R10 |
0x4ba4fd NOPL (%RAX) |
(2665) 0x4ba500 MOV -0x38(%R12,%R10,1),%R11 |
(2665) 0x4ba505 MOV -0x30(%R12,%R10,1),%RAX |
(2665) 0x4ba50a VMOVSD (%R13,%RAX,8),%XMM3 |
(2665) 0x4ba511 VMULSD -0x30(%R14,%R10,1),%XMM3,%XMM3 |
(2665) 0x4ba518 VMOVSD (%R13,%R11,8),%XMM4 |
(2665) 0x4ba51f VFMADD231SD -0x38(%R14,%R10,1),%XMM4,%XMM3 |
(2665) 0x4ba526 MOV -0x28(%R12,%R10,1),%RAX |
(2665) 0x4ba52b VMOVSD (%R13,%RAX,8),%XMM4 |
(2665) 0x4ba532 VFMADD132SD -0x28(%R14,%R10,1),%XMM3,%XMM4 |
(2665) 0x4ba539 MOV -0x20(%R12,%R10,1),%RAX |
(2665) 0x4ba53e VMOVSD (%R13,%RAX,8),%XMM3 |
(2665) 0x4ba545 VFMADD132SD -0x20(%R14,%R10,1),%XMM4,%XMM3 |
(2665) 0x4ba54c MOV -0x18(%R12,%R10,1),%RAX |
(2665) 0x4ba551 VMOVSD (%R13,%RAX,8),%XMM4 |
(2665) 0x4ba558 VFMADD132SD -0x18(%R14,%R10,1),%XMM3,%XMM4 |
(2665) 0x4ba55f MOV -0x10(%R12,%R10,1),%RAX |
(2665) 0x4ba564 VMOVSD -0x10(%R14,%R10,1),%XMM3 |
(2665) 0x4ba56b MOV -0x8(%R12,%R10,1),%R11 |
(2665) 0x4ba570 VMOVSD -0x8(%R14,%R10,1),%XMM5 |
(2665) 0x4ba577 MOV (%R12,%R10,1),%RCX |
(2665) 0x4ba57b VFMADD132SD (%R13,%RAX,8),%XMM4,%XMM3 |
(2665) 0x4ba582 VFMSUB132SD (%R13,%R11,8),%XMM2,%XMM5 |
(2665) 0x4ba589 VMOVSD (%R14,%R10,1),%XMM2 |
(2665) 0x4ba58f VFMADD231SD (%R13,%RCX,8),%XMM2,%XMM3 |
(2665) 0x4ba596 VADDSD %XMM5,%XMM3,%XMM2 |
(2665) 0x4ba59a VXORPD %XMM3,%XMM3,%XMM3 |
(2665) 0x4ba59e VSUBSD %XMM2,%XMM3,%XMM2 |
(2665) 0x4ba5a2 ADD $0x40,%R10 |
(2665) 0x4ba5a6 DEC %R9 |
(2665) 0x4ba5a9 JNE 4ba500 |
0x4ba5af MOV %R8D,%R9D |
0x4ba5b2 AND $0x7,%R9D |
0x4ba5b6 DEC %R9 |
0x4ba5b9 CMP $0x6,%R9 |
0x4ba5bd JA 4ba605 |
0x4ba605 MOV -0x38(%RBP),%RCX |
0x4ba609 MOV -0x30(%RBP),%R9 |
0x4ba60d JMP 4ba6b2 |
0x4ba6b2 MOV -0x60(%RBP),%RAX |
0x4ba6b6 MOV (%RAX,%R9,8),%RDI |
0x4ba6ba MOV 0x8(%RAX,%R9,8),%R8 |
0x4ba6bf SUB %RDI,%R8 |
0x4ba6c2 JLE 4ba490 |
0x4ba6c8 CMP $0x8,%R8 |
0x4ba6cc JB 4ba792 |
0x4ba6d2 MOV %R8,%R9 |
0x4ba6d5 SHR $0x3,%R9 |
0x4ba6d9 LEA 0x38(,%RDI,8),%R10 |
0x4ba6e1 MOV -0x40(%RBP),%RSI |
0x4ba6e5 NOPW %CS:(%RAX,%RAX,1) |
(2664) 0x4ba6f0 MOV -0x38(%R15,%R10,1),%RAX |
(2664) 0x4ba6f5 MOV -0x30(%R15,%R10,1),%RCX |
(2664) 0x4ba6fa VMOVSD (%RSI,%RCX,8),%XMM3 |
(2664) 0x4ba6ff VMULSD -0x30(%RBX,%R10,1),%XMM3,%XMM3 |
(2664) 0x4ba706 VMOVSD (%RSI,%RAX,8),%XMM4 |
(2664) 0x4ba70b VFMADD231SD -0x38(%RBX,%R10,1),%XMM4,%XMM3 |
(2664) 0x4ba712 MOV -0x28(%R15,%R10,1),%RAX |
(2664) 0x4ba717 VMOVSD (%RSI,%RAX,8),%XMM4 |
(2664) 0x4ba71c VFMADD132SD -0x28(%RBX,%R10,1),%XMM3,%XMM4 |
(2664) 0x4ba723 MOV -0x20(%R15,%R10,1),%RAX |
(2664) 0x4ba728 VMOVSD (%RSI,%RAX,8),%XMM3 |
(2664) 0x4ba72d VFMADD132SD -0x20(%RBX,%R10,1),%XMM4,%XMM3 |
(2664) 0x4ba734 MOV -0x18(%R15,%R10,1),%RAX |
(2664) 0x4ba739 VMOVSD (%RSI,%RAX,8),%XMM4 |
(2664) 0x4ba73e VFMADD132SD -0x18(%RBX,%R10,1),%XMM3,%XMM4 |
(2664) 0x4ba745 MOV -0x10(%R15,%R10,1),%RAX |
(2664) 0x4ba74a VMOVSD -0x10(%RBX,%R10,1),%XMM3 |
(2664) 0x4ba751 MOV -0x8(%R15,%R10,1),%RCX |
(2664) 0x4ba756 VMOVSD -0x8(%RBX,%R10,1),%XMM5 |
(2664) 0x4ba75d MOV (%R15,%R10,1),%R11 |
(2664) 0x4ba761 VFMADD132SD (%RSI,%RAX,8),%XMM4,%XMM3 |
(2664) 0x4ba767 VFMSUB132SD (%RSI,%RCX,8),%XMM2,%XMM5 |
(2664) 0x4ba76d VMOVSD (%RBX,%R10,1),%XMM2 |
(2664) 0x4ba773 VFMADD231SD (%RSI,%R11,8),%XMM2,%XMM3 |
(2664) 0x4ba779 VADDSD %XMM5,%XMM3,%XMM2 |
(2664) 0x4ba77d VXORPD %XMM3,%XMM3,%XMM3 |
(2664) 0x4ba781 VSUBSD %XMM2,%XMM3,%XMM2 |
(2664) 0x4ba785 ADD $0x40,%R10 |
(2664) 0x4ba789 DEC %R9 |
(2664) 0x4ba78c JNE 4ba6f0 |
0x4ba792 MOV %R8D,%R9D |
0x4ba795 AND $0x7,%R9D |
0x4ba799 DEC %R9 |
0x4ba79c CMP $0x6,%R9 |
0x4ba7a0 JA 4ba47f |
/scratch_na/users/xoserete/qaas_runs/171-172-8218/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c: 3662 - 3682 |
-------------------------------------------------------------------------------- |
3662: #pragma omp parallel for private(i,ii,jj,res) HYPRE_SMP_SCHEDULE |
3663: #endif |
3664: for (i = 0; i < n; i++) |
3665: { |
3666: /*----------------------------------------------------------- |
3667: * If diagonal is nonzero, relax point i; otherwise, skip it. |
3668: *-----------------------------------------------------------*/ |
3669: if (A_diag_data[A_diag_i[i]] != zero) |
3670: { |
3671: res = f_data[i]; |
3672: for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++) |
3673: { |
3674: ii = A_diag_j[jj]; |
3675: res -= A_diag_data[jj] * Vtemp_data[ii]; |
3676: } |
3677: for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++) |
3678: { |
3679: ii = A_offd_j[jj]; |
3680: res -= A_offd_data[jj] * Vext_data[ii]; |
3681: } |
3682: u_data[i] += (relax_weight*res)/l1_norms[i]; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.78 |
CQA speedup if FP arith vectorized | 2.72 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.50 |
Bottlenecks | micro-operation queue |
Function | hypre_ParCSRRelaxThreads.extracted.57 |
Source | ams.c:3662-3662,ams.c:3669-3677,ams.c:3682-3682 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.00 |
CQA cycles if no scalar integer | 4.50 |
CQA cycles if FP arith vectorized | 2.94 |
CQA cycles if fully vectorized | 2.00 |
Front-end cycles | 8.00 |
DIV/SQRT cycles | 4.00 |
P0 cycles | 4.50 |
P1 cycles | 3.00 |
P2 cycles | 5.33 |
P3 cycles | 5.33 |
P4 cycles | 1.00 |
P5 cycles | 3.00 |
P6 cycles | 4.50 |
P7 cycles | 1.00 |
P8 cycles | 1.00 |
P9 cycles | 1.00 |
P10 cycles | 3.00 |
P11 cycles | 5.33 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 8.31 - 8.33 |
Stall cycles (UFS) | 0.00 |
Nb insns | 47.00 |
Nb uops | 46.00 |
Nb loads | 16.00 |
Nb stores | 2.00 |
Nb stack references | 8.00 |
FLOP/cycle | 0.38 |
Nb FLOP add-sub | 1.00 |
Nb FLOP mul | 1.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 1.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 18.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 128.00 |
Bytes stored | 16.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.78 |
CQA speedup if FP arith vectorized | 2.72 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.50 |
Bottlenecks | micro-operation queue |
Function | hypre_ParCSRRelaxThreads.extracted.57 |
Source | ams.c:3662-3662,ams.c:3669-3677,ams.c:3682-3682 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.00 |
CQA cycles if no scalar integer | 4.50 |
CQA cycles if FP arith vectorized | 2.94 |
CQA cycles if fully vectorized | 2.00 |
Front-end cycles | 8.00 |
DIV/SQRT cycles | 4.00 |
P0 cycles | 4.50 |
P1 cycles | 3.00 |
P2 cycles | 5.33 |
P3 cycles | 5.33 |
P4 cycles | 1.00 |
P5 cycles | 3.00 |
P6 cycles | 4.50 |
P7 cycles | 1.00 |
P8 cycles | 1.00 |
P9 cycles | 1.00 |
P10 cycles | 3.00 |
P11 cycles | 5.33 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 8.31 - 8.33 |
Stall cycles (UFS) | 0.00 |
Nb insns | 47.00 |
Nb uops | 46.00 |
Nb loads | 16.00 |
Nb stores | 2.00 |
Nb stack references | 8.00 |
FLOP/cycle | 0.38 |
Nb FLOP add-sub | 1.00 |
Nb FLOP mul | 1.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 1.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 18.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 128.00 |
Bytes stored | 16.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | hypre_ParCSRRelaxThreads.extracted.57 |
Source file and lines | ams.c:3662-3682 |
Module | exec |
nb instructions | 47 |
nb uops | 46 |
loop length | 214 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 8 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 8.00 cycles |
front end | 8.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 3.00 | 5.33 | 5.33 | 1.00 | 3.00 | 4.50 | 1.00 | 1.00 | 1.00 | 3.00 | 5.33 |
cycles | 4.50 | 3.00 | 5.33 | 5.33 | 1.00 | 3.00 | 4.50 | 1.00 | 1.00 | 1.00 | 3.00 | 5.33 |
Cycles executing div or sqrt instructions | 4.00 |
FE+BE cycles | 8.31-8.33 |
Stall cycles | 0.00 |
Front-end | 8.00 |
Dispatch | 5.33 |
DIV/SQRT | 4.00 |
Overall L1 | 8.00 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x48(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD %XMM0,%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VDIVSD (%RAX,%R9,8),%XMM2,%XMM2 | 1 | 1 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 13-15 | 4 |
MOV -0x68(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VADDSD (%RAX,%R9,8),%XMM2,%XMM2 | 1 | 0 | 0.50 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.50 |
VMOVSD %XMM2,(%RAX,%R9,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RAX,%R9,8),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%R9,8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RDI,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R9,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JLE 4ba6b2 <hypre_ParCSRRelaxThreads.extracted.57+0x312> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 4ba5af <hypre_ParCSRRelaxThreads.extracted.57+0x20f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA 0x38(,%RDI,8),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x7,%R9D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
DEC %R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP $0x6,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JA 4ba605 <hypre_ParCSRRelaxThreads.extracted.57+0x265> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4ba6b2 <hypre_ParCSRRelaxThreads.extracted.57+0x312> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV -0x60(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX,%R9,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX,%R9,8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RDI,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4ba490 <hypre_ParCSRRelaxThreads.extracted.57+0xf0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 4ba792 <hypre_ParCSRRelaxThreads.extracted.57+0x3f2> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA 0x38(,%RDI,8),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x40(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x7,%R9D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
DEC %R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP $0x6,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JA 4ba47f <hypre_ParCSRRelaxThreads.extracted.57+0xdf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | hypre_ParCSRRelaxThreads.extracted.57 |
Source file and lines | ams.c:3662-3682 |
Module | exec |
nb instructions | 47 |
nb uops | 46 |
loop length | 214 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 8 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 8.00 cycles |
front end | 8.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 3.00 | 5.33 | 5.33 | 1.00 | 3.00 | 4.50 | 1.00 | 1.00 | 1.00 | 3.00 | 5.33 |
cycles | 4.50 | 3.00 | 5.33 | 5.33 | 1.00 | 3.00 | 4.50 | 1.00 | 1.00 | 1.00 | 3.00 | 5.33 |
Cycles executing div or sqrt instructions | 4.00 |
FE+BE cycles | 8.31-8.33 |
Stall cycles | 0.00 |
Front-end | 8.00 |
Dispatch | 5.33 |
DIV/SQRT | 4.00 |
Overall L1 | 8.00 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x48(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD %XMM0,%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VDIVSD (%RAX,%R9,8),%XMM2,%XMM2 | 1 | 1 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 13-15 | 4 |
MOV -0x68(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VADDSD (%RAX,%R9,8),%XMM2,%XMM2 | 1 | 0 | 0.50 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.50 |
VMOVSD %XMM2,(%RAX,%R9,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RAX,%R9,8),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RCX,%R9,8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RDI,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %R9,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JLE 4ba6b2 <hypre_ParCSRRelaxThreads.extracted.57+0x312> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 4ba5af <hypre_ParCSRRelaxThreads.extracted.57+0x20f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA 0x38(,%RDI,8),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x7,%R9D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
DEC %R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP $0x6,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JA 4ba605 <hypre_ParCSRRelaxThreads.extracted.57+0x265> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 4ba6b2 <hypre_ParCSRRelaxThreads.extracted.57+0x312> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV -0x60(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX,%R9,8),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RAX,%R9,8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RDI,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JLE 4ba490 <hypre_ParCSRRelaxThreads.extracted.57+0xf0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 4ba792 <hypre_ParCSRRelaxThreads.extracted.57+0x3f2> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x3,%R9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA 0x38(,%RDI,8),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x40(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x7,%R9D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
DEC %R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP $0x6,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JA 4ba47f <hypre_ParCSRRelaxThreads.extracted.57+0xdf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |