Function: _Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DI ... | Module: exec | Source: pack_kernel.cpp:88-92 [...] | Coverage: 0.03% |
---|
Function: _Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DI ... | Module: exec | Source: pack_kernel.cpp:88-92 [...] | Coverage: 0.03% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-112-9712/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 69 - 69 |
-------------------------------------------------------------------------------- |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
/home/eoseret/qaas_runs_CPU_9468/171-112-9712/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/pack_kernel.cpp: 88 - 92 |
-------------------------------------------------------------------------------- |
88: #pragma omp parallel for simd |
89: for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { |
90: for (int j = 0; j < depth; ++j) { |
91: int index = buffer_offset + j + k * depth; |
92: field(x_min - j, k) = left_rcv[index]; |
0x43bc20 PUSH %RBP |
0x43bc21 MOV %RSP,%RBP |
0x43bc24 PUSH %R15 |
0x43bc26 PUSH %R14 |
0x43bc28 PUSH %R13 |
0x43bc2a PUSH %R12 |
0x43bc2c MOV %RDI,%R12 |
0x43bc2f PUSH %RBX |
0x43bc30 SUB $0x8,%RSP |
0x43bc34 MOV 0x1c(%RDI),%EBX |
0x43bc37 MOV 0x14(%RDI),%R8D |
0x43bc3b SUB %EBX,%R8D |
0x43bc3e LEA 0x1(%R8),%R14D |
0x43bc42 CALL 404650 <omp_get_num_threads@plt> |
0x43bc47 MOV %EAX,%R13D |
0x43bc4a CALL 404540 <omp_get_thread_num@plt> |
0x43bc4f MOV %EAX,%ECX |
0x43bc51 MOV 0x18(%R12),%EAX |
0x43bc56 ADD 0x24(%R12),%EAX |
0x43bc5b LEA 0x2(%RBX,%RAX,1),%EAX |
0x43bc5f SUB %R14D,%EAX |
0x43bc62 CLTD |
0x43bc63 IDIV %R13D |
0x43bc66 CMP %EDX,%ECX |
0x43bc68 JL 43be26 |
0x43bc6e IMUL %EAX,%ECX |
0x43bc71 ADD %ECX,%EDX |
0x43bc73 ADD %EDX,%EAX |
0x43bc75 CMP %EAX,%EDX |
0x43bc77 JGE 43bdfe |
0x43bc7d ADD %R14D,%EDX |
0x43bc80 MOV %EBX,%R9D |
0x43bc83 MOVSXD %EBX,%R11 |
0x43bc86 MOV 0x8(%R12),%R15 |
0x43bc8b IMUL %EDX,%R9D |
0x43bc8f MOV (%R12),%R10 |
0x43bc93 MOVSXD 0x10(%R12),%R13 |
0x43bc98 LEA (%R14,%RAX,1),%R8D |
0x43bc9c SAL $0x3,%R11 |
0x43bca0 MOVSXD 0x20(%R12),%R14 |
0x43bca5 MOVSXD %EDX,%RDI |
0x43bca8 XOR %R12D,%R12D |
0x43bcab NOPL (%RAX,%RAX,1) |
(257) 0x43bcb0 TEST %EBX,%EBX |
(257) 0x43bcb2 JLE 43bdeb |
(257) 0x43bcb8 MOV 0x8(%R15),%RAX |
(257) 0x43bcbc MOVSXD %R9D,%RCX |
(257) 0x43bcbf MOV 0x10(%R10),%RSI |
(257) 0x43bcc3 ADD %R14,%RCX |
(257) 0x43bcc6 LEA (%RAX,%RCX,8),%RDX |
(257) 0x43bcca MOV (%R10),%RCX |
(257) 0x43bccd IMUL %RDI,%RCX |
(257) 0x43bcd1 ADD %R13,%RCX |
(257) 0x43bcd4 SAL $0x3,%RCX |
(257) 0x43bcd8 LEA (%RSI,%RCX,1),%RAX |
(257) 0x43bcdc SUB %R11,%RSI |
(257) 0x43bcdf ADD %RCX,%RSI |
(257) 0x43bce2 MOV %RAX,%RCX |
(257) 0x43bce5 SUB %RSI,%RCX |
(257) 0x43bce8 SUB $0x8,%RCX |
(257) 0x43bcec SHR $0x3,%RCX |
(257) 0x43bcf0 INC %RCX |
(257) 0x43bcf3 AND $0x7,%ECX |
(257) 0x43bcf6 JE 43bd8f |
(257) 0x43bcfc CMP $0x1,%RCX |
(257) 0x43bd00 JE 43bd79 |
(257) 0x43bd02 CMP $0x2,%RCX |
(257) 0x43bd06 JE 43bd68 |
(257) 0x43bd08 CMP $0x3,%RCX |
(257) 0x43bd0c JE 43bd57 |
(257) 0x43bd0e CMP $0x4,%RCX |
(257) 0x43bd12 JE 43bd46 |
(257) 0x43bd14 CMP $0x5,%RCX |
(257) 0x43bd18 JE 43bd35 |
(257) 0x43bd1a CMP $0x6,%RCX |
(257) 0x43bd1e JNE 43be10 |
(257) 0x43bd24 VMOVSD (%RDX),%XMM1 |
(257) 0x43bd28 SUB $0x8,%RAX |
(257) 0x43bd2c ADD $0x8,%RDX |
(257) 0x43bd30 VMOVSD %XMM1,0x8(%RAX) |
(257) 0x43bd35 VMOVSD (%RDX),%XMM2 |
(257) 0x43bd39 SUB $0x8,%RAX |
(257) 0x43bd3d ADD $0x8,%RDX |
(257) 0x43bd41 VMOVSD %XMM2,0x8(%RAX) |
(257) 0x43bd46 VMOVSD (%RDX),%XMM3 |
(257) 0x43bd4a SUB $0x8,%RAX |
(257) 0x43bd4e ADD $0x8,%RDX |
(257) 0x43bd52 VMOVSD %XMM3,0x8(%RAX) |
(257) 0x43bd57 VMOVSD (%RDX),%XMM4 |
(257) 0x43bd5b SUB $0x8,%RAX |
(257) 0x43bd5f ADD $0x8,%RDX |
(257) 0x43bd63 VMOVSD %XMM4,0x8(%RAX) |
(257) 0x43bd68 VMOVSD (%RDX),%XMM5 |
(257) 0x43bd6c SUB $0x8,%RAX |
(257) 0x43bd70 ADD $0x8,%RDX |
(257) 0x43bd74 VMOVSD %XMM5,0x8(%RAX) |
(257) 0x43bd79 VMOVSD (%RDX),%XMM6 |
(257) 0x43bd7d SUB $0x8,%RAX |
(257) 0x43bd81 ADD $0x8,%RDX |
(257) 0x43bd85 VMOVSD %XMM6,0x8(%RAX) |
(257) 0x43bd8a CMP %RSI,%RAX |
(257) 0x43bd8d JE 43bdeb |
(258) 0x43bd8f VMOVSD (%RDX),%XMM7 |
(258) 0x43bd93 SUB $0x40,%RAX |
(258) 0x43bd97 ADD $0x40,%RDX |
(258) 0x43bd9b VMOVSD %XMM7,0x40(%RAX) |
(258) 0x43bda0 VMOVSD -0x38(%RDX),%XMM8 |
(258) 0x43bda5 VMOVSD %XMM8,0x38(%RAX) |
(258) 0x43bdaa VMOVSD -0x30(%RDX),%XMM9 |
(258) 0x43bdaf VMOVSD %XMM9,0x30(%RAX) |
(258) 0x43bdb4 VMOVSD -0x28(%RDX),%XMM10 |
(258) 0x43bdb9 VMOVSD %XMM10,0x28(%RAX) |
(258) 0x43bdbe VMOVSD -0x20(%RDX),%XMM11 |
(258) 0x43bdc3 VMOVSD %XMM11,0x20(%RAX) |
(258) 0x43bdc8 VMOVSD -0x18(%RDX),%XMM12 |
(258) 0x43bdcd VMOVSD %XMM12,0x18(%RAX) |
(258) 0x43bdd2 VMOVSD -0x10(%RDX),%XMM13 |
(258) 0x43bdd7 VMOVSD %XMM13,0x10(%RAX) |
(258) 0x43bddc VMOVSD -0x8(%RDX),%XMM14 |
(258) 0x43bde1 VMOVSD %XMM14,0x8(%RAX) |
(258) 0x43bde6 CMP %RSI,%RAX |
(258) 0x43bde9 JNE 43bd8f |
(257) 0x43bdeb INC %RDI |
(257) 0x43bdee ADD %EBX,%R9D |
(257) 0x43bdf1 LEA (%R12,%RDI,1),%ESI |
(257) 0x43bdf5 CMP %ESI,%R8D |
(257) 0x43bdf8 JG 43bcb0 |
0x43bdfe ADD $0x8,%RSP |
0x43be02 POP %RBX |
0x43be03 POP %R12 |
0x43be05 POP %R13 |
0x43be07 POP %R14 |
0x43be09 POP %R15 |
0x43be0b POP %RBP |
0x43be0c RET |
0x43be0d NOPL (%RAX) |
(257) 0x43be10 VMOVSD (%RDX),%XMM0 |
(257) 0x43be14 SUB $0x8,%RAX |
(257) 0x43be18 ADD $0x8,%RDX |
(257) 0x43be1c VMOVSD %XMM0,0x8(%RAX) |
(257) 0x43be21 JMP 43bd24 |
0x43be26 INC %EAX |
0x43be28 XOR %EDX,%EDX |
0x43be2a JMP 43bc6e |
0x43be2f NOP |
Path / |
Source file and lines | pack_kernel.cpp:88-92 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 172 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 6.00 | 5.00 | 5.00 | 4.00 | 4.60 | 4.50 | 4.00 | 4.00 | 4.00 | 4.40 | 5.00 |
cycles | 4.50 | 8.87 | 5.00 | 5.00 | 4.00 | 4.60 | 4.50 | 4.00 | 4.00 | 4.00 | 4.40 | 5.00 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 17.82-17.63 |
Stall cycles | 7.78-7.59 |
LM full (events) | 9.39-9.20 |
Front-end | 10.17 |
Dispatch | 8.87 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 8% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | 9% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 8% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x1c(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x14(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 404650 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 404540 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x18(%R12),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD 0x24(%R12),%EAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x2(%RBX,%RAX,1),%EAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R13D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 43be26 <_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 43bdfe <_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x1de> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R14D,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %EBX,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOVSXD %EBX,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV 0x8(%R12),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %EDX,%R9D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV (%R12),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x10(%R12),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
SAL $0x3,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOVSXD 0x20(%R12),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EDX,%RDI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 43bc6e <_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x4e> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | pack_kernel.cpp:88-92 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 172 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.50 | 6.00 | 5.00 | 5.00 | 4.00 | 4.60 | 4.50 | 4.00 | 4.00 | 4.00 | 4.40 | 5.00 |
cycles | 4.50 | 8.87 | 5.00 | 5.00 | 4.00 | 4.60 | 4.50 | 4.00 | 4.00 | 4.00 | 4.40 | 5.00 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 17.82-17.63 |
Stall cycles | 7.78-7.59 |
LM full (events) | 9.39-9.20 |
Front-end | 10.17 |
Dispatch | 8.87 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 8% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | 9% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 8% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x1c(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x14(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 404650 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 404540 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x18(%R12),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD 0x24(%R12),%EAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x2(%RBX,%RAX,1),%EAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R13D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 43be26 <_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 43bdfe <_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x1de> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R14D,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %EBX,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOVSXD %EBX,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV 0x8(%R12),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
IMUL %EDX,%R9D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV (%R12),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x10(%R12),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
SAL $0x3,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOVSXD 0x20(%R12),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EDX,%RDI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 43bc6e <_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0+0x4e> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼_Z26clover_unpack_message_leftR16global_variablesiiiiRN6clover8Buffer2DIdEERNS1_8Buffer1DIdEEiiiiiii._omp_fn.0– | 0.03 | 0.01 |
▼Loop 257 - pack_kernel.cpp:90-92 - exec– | 0.03 | 0.02 |
○Loop 258 - pack_kernel.cpp:90-92 - exec | 0 | 0 |