Function: _Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdE ... | Module: exec | Source: pack_kernel.cpp:55-59 [...] | Coverage: 0.02% |
---|
Function: _Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdE ... | Module: exec | Source: pack_kernel.cpp:55-59 [...] | Coverage: 0.02% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-112-9712/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 69 - 69 |
-------------------------------------------------------------------------------- |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
/home/eoseret/qaas_runs_CPU_9468/171-112-9712/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/pack_kernel.cpp: 55 - 59 |
-------------------------------------------------------------------------------- |
55: #pragma omp parallel for simd |
56: for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { |
57: for (int j = 0; j < depth; ++j) { |
58: int index = buffer_offset + j + k * depth; |
59: left_snd[index] = field(x_min + x_inc - 1 + j + 2, k); |
0x43ba20 PUSH %RBP |
0x43ba21 MOV %RSP,%RBP |
0x43ba24 PUSH %R15 |
0x43ba26 PUSH %R14 |
0x43ba28 PUSH %R13 |
0x43ba2a MOV %RDI,%R13 |
0x43ba2d PUSH %R12 |
0x43ba2f PUSH %RBX |
0x43ba30 SUB $0x8,%RSP |
0x43ba34 MOV 0x1c(%RDI),%EBX |
0x43ba37 MOV 0x14(%RDI),%R8D |
0x43ba3b SUB %EBX,%R8D |
0x43ba3e LEA 0x1(%R8),%R14D |
0x43ba42 CALL 404650 <omp_get_num_threads@plt> |
0x43ba47 MOV %EAX,%R12D |
0x43ba4a CALL 404540 <omp_get_thread_num@plt> |
0x43ba4f MOV %EAX,%ECX |
0x43ba51 MOV 0x18(%R13),%EAX |
0x43ba55 ADD 0x28(%R13),%EAX |
0x43ba59 LEA 0x2(%RBX,%RAX,1),%EAX |
0x43ba5d SUB %R14D,%EAX |
0x43ba60 CLTD |
0x43ba61 IDIV %R12D |
0x43ba64 CMP %EDX,%ECX |
0x43ba66 JL 43bc13 |
0x43ba6c IMUL %EAX,%ECX |
0x43ba6f ADD %ECX,%EDX |
0x43ba71 ADD %EDX,%EAX |
0x43ba73 CMP %EAX,%EDX |
0x43ba75 JGE 43bbf1 |
0x43ba7b ADD %R14D,%EDX |
0x43ba7e MOV 0x10(%R13),%ESI |
0x43ba82 MOVSXD %EBX,%R10 |
0x43ba85 MOV (%R13),%R11 |
0x43ba89 MOV %EDX,%R9D |
0x43ba8c MOV 0x8(%R13),%R15 |
0x43ba90 LEA (%R14,%RAX,1),%R8D |
0x43ba94 SAL $0x3,%R10 |
0x43ba98 IMUL %EBX,%R9D |
0x43ba9c ADD 0x24(%R13),%ESI |
0x43baa0 MOVSXD 0x20(%R13),%R14 |
0x43baa4 MOVSXD %EDX,%RDI |
0x43baa7 MOVSXD %ESI,%R12 |
0x43baaa XOR %R13D,%R13D |
0x43baad NOPL (%RAX) |
(255) 0x43bab0 TEST %EBX,%EBX |
(255) 0x43bab2 JLE 43bbdd |
(255) 0x43bab8 MOV (%R11),%RCX |
(255) 0x43babb MOV 0x8(%R15),%RDX |
(255) 0x43babf MOVSXD %R9D,%RSI |
(255) 0x43bac2 ADD %R14,%RSI |
(255) 0x43bac5 MOV 0x10(%R11),%RAX |
(255) 0x43bac9 IMUL %RDI,%RCX |
(255) 0x43bacd LEA (%RDX,%RSI,8),%RDX |
(255) 0x43bad1 LEA -0x8(%R10),%RSI |
(255) 0x43bad5 SHR $0x3,%RSI |
(255) 0x43bad9 ADD %R12,%RCX |
(255) 0x43badc INC %RSI |
(255) 0x43badf LEA (%RAX,%RCX,8),%RCX |
(255) 0x43bae3 XOR %EAX,%EAX |
(255) 0x43bae5 AND $0x7,%ESI |
(255) 0x43bae8 JE 43bb75 |
(255) 0x43baee CMP $0x1,%RSI |
(255) 0x43baf2 JE 43bb61 |
(255) 0x43baf4 CMP $0x2,%RSI |
(255) 0x43baf8 JE 43bb52 |
(255) 0x43bafa CMP $0x3,%RSI |
(255) 0x43bafe JE 43bb43 |
(255) 0x43bb00 CMP $0x4,%RSI |
(255) 0x43bb04 JE 43bb34 |
(255) 0x43bb06 CMP $0x5,%RSI |
(255) 0x43bb0a JE 43bb25 |
(255) 0x43bb0c CMP $0x6,%RSI |
(255) 0x43bb10 JNE 43bc00 |
(255) 0x43bb16 VMOVSD 0x8(%RCX,%RAX,1),%XMM1 |
(255) 0x43bb1c VMOVSD %XMM1,(%RDX,%RAX,1) |
(255) 0x43bb21 ADD $0x8,%RAX |
(255) 0x43bb25 VMOVSD 0x8(%RCX,%RAX,1),%XMM2 |
(255) 0x43bb2b VMOVSD %XMM2,(%RDX,%RAX,1) |
(255) 0x43bb30 ADD $0x8,%RAX |
(255) 0x43bb34 VMOVSD 0x8(%RCX,%RAX,1),%XMM3 |
(255) 0x43bb3a VMOVSD %XMM3,(%RDX,%RAX,1) |
(255) 0x43bb3f ADD $0x8,%RAX |
(255) 0x43bb43 VMOVSD 0x8(%RCX,%RAX,1),%XMM4 |
(255) 0x43bb49 VMOVSD %XMM4,(%RDX,%RAX,1) |
(255) 0x43bb4e ADD $0x8,%RAX |
(255) 0x43bb52 VMOVSD 0x8(%RCX,%RAX,1),%XMM5 |
(255) 0x43bb58 VMOVSD %XMM5,(%RDX,%RAX,1) |
(255) 0x43bb5d ADD $0x8,%RAX |
(255) 0x43bb61 VMOVSD 0x8(%RCX,%RAX,1),%XMM6 |
(255) 0x43bb67 VMOVSD %XMM6,(%RDX,%RAX,1) |
(255) 0x43bb6c ADD $0x8,%RAX |
(255) 0x43bb70 CMP %RAX,%R10 |
(255) 0x43bb73 JE 43bbdd |
(256) 0x43bb75 VMOVSD 0x8(%RCX,%RAX,1),%XMM7 |
(256) 0x43bb7b VMOVSD %XMM7,(%RDX,%RAX,1) |
(256) 0x43bb80 VMOVSD 0x10(%RCX,%RAX,1),%XMM8 |
(256) 0x43bb86 VMOVSD %XMM8,0x8(%RDX,%RAX,1) |
(256) 0x43bb8c VMOVSD 0x18(%RCX,%RAX,1),%XMM9 |
(256) 0x43bb92 VMOVSD %XMM9,0x10(%RDX,%RAX,1) |
(256) 0x43bb98 VMOVSD 0x20(%RCX,%RAX,1),%XMM10 |
(256) 0x43bb9e VMOVSD %XMM10,0x18(%RDX,%RAX,1) |
(256) 0x43bba4 VMOVSD 0x28(%RCX,%RAX,1),%XMM11 |
(256) 0x43bbaa VMOVSD %XMM11,0x20(%RDX,%RAX,1) |
(256) 0x43bbb0 VMOVSD 0x30(%RCX,%RAX,1),%XMM12 |
(256) 0x43bbb6 VMOVSD %XMM12,0x28(%RDX,%RAX,1) |
(256) 0x43bbbc VMOVSD 0x38(%RCX,%RAX,1),%XMM13 |
(256) 0x43bbc2 VMOVSD %XMM13,0x30(%RDX,%RAX,1) |
(256) 0x43bbc8 VMOVSD 0x40(%RCX,%RAX,1),%XMM14 |
(256) 0x43bbce VMOVSD %XMM14,0x38(%RDX,%RAX,1) |
(256) 0x43bbd4 ADD $0x40,%RAX |
(256) 0x43bbd8 CMP %RAX,%R10 |
(256) 0x43bbdb JNE 43bb75 |
(255) 0x43bbdd INC %RDI |
(255) 0x43bbe0 ADD %EBX,%R9D |
(255) 0x43bbe3 LEA (%R13,%RDI,1),%ECX |
(255) 0x43bbe8 CMP %ECX,%R8D |
(255) 0x43bbeb JG 43bab0 |
0x43bbf1 ADD $0x8,%RSP |
0x43bbf5 POP %RBX |
0x43bbf6 POP %R12 |
0x43bbf8 POP %R13 |
0x43bbfa POP %R14 |
0x43bbfc POP %R15 |
0x43bbfe POP %RBP |
0x43bbff RET |
(255) 0x43bc00 VMOVSD 0x8(%RCX),%XMM0 |
(255) 0x43bc05 MOV $0x8,%EAX |
(255) 0x43bc0a VMOVSD %XMM0,(%RDX) |
(255) 0x43bc0e JMP 43bb16 |
0x43bc13 INC %EAX |
0x43bc15 XOR %EDX,%EDX |
0x43bc17 JMP 43ba6c |
0x43bc1c NOPL (%RAX) |
Path / |
Source file and lines | pack_kernel.cpp:55-59 |
Module | exec |
nb instructions | 57 |
nb uops | 62 |
loop length | 172 |
used x86 registers | 16 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.33 cycles |
front end | 10.33 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.10 | 6.00 | 5.33 | 5.33 | 4.00 | 5.07 | 4.90 | 4.00 | 4.00 | 4.00 | 4.93 | 5.33 |
cycles | 5.10 | 9.40 | 5.33 | 5.33 | 4.00 | 5.07 | 4.90 | 4.00 | 4.00 | 4.00 | 4.93 | 5.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 18.15-17.99 |
Stall cycles | 7.96-7.80 |
LM full (events) | 8.77-8.61 |
Front-end | 10.33 |
Dispatch | 9.40 |
DIV/SQRT | 6.00 |
Overall L1 | 10.33 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 8% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | 8% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 8% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x1c(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x14(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 404650 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 404540 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x18(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD 0x28(%R13),%EAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x2(%RBX,%RAX,1),%EAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 43bc13 <_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x1f3> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 43bbf1 <_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x1d1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R14D,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x10(%R13),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EBX,%R10 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV (%R13),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %EDX,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x8(%R13),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
SAL $0x3,%R10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
IMUL %EBX,%R9D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0x24(%R13),%ESI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOVSXD 0x20(%R13),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EDX,%RDI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOVSXD %ESI,%R12 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 43ba6c <_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x4c> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | pack_kernel.cpp:55-59 |
Module | exec |
nb instructions | 57 |
nb uops | 62 |
loop length | 172 |
used x86 registers | 16 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.33 cycles |
front end | 10.33 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.10 | 6.00 | 5.33 | 5.33 | 4.00 | 5.07 | 4.90 | 4.00 | 4.00 | 4.00 | 4.93 | 5.33 |
cycles | 5.10 | 9.40 | 5.33 | 5.33 | 4.00 | 5.07 | 4.90 | 4.00 | 4.00 | 4.00 | 4.93 | 5.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 18.15-17.99 |
Stall cycles | 7.96-7.80 |
LM full (events) | 8.77-8.61 |
Front-end | 10.33 |
Dispatch | 9.40 |
DIV/SQRT | 6.00 |
Overall L1 | 10.33 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 8% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | 8% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 8% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x1c(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x14(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 404650 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 404540 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x18(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD 0x28(%R13),%EAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x2(%RBX,%RAX,1),%EAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 43bc13 <_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x1f3> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 43bbf1 <_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x1d1> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R14D,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x10(%R13),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EBX,%R10 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV (%R13),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %EDX,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x8(%R13),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
SAL $0x3,%R10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
IMUL %EBX,%R9D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0x24(%R13),%ESI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOVSXD 0x20(%R13),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EDX,%RDI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOVSXD %ESI,%R12 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 43ba6c <_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x4c> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼_Z24clover_pack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0– | 0.02 | 0.01 |
▼Loop 255 - pack_kernel.cpp:57-59 - exec– | 0.02 | 0.01 |
○Loop 256 - pack_kernel.cpp:57-59 - exec | 0 | 0 |