Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> c ... | Module: exec | Source: update_halo.cpp:183-186 [...] | Coverage: 0.01% |
---|
Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> c ... | Module: exec | Source: update_halo.cpp:183-186 [...] | Coverage: 0.01% |
---|
/scratch_na/users/xoserete/qaas_runs/171-415-4969/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/update_halo.cpp: 183 - 186 |
-------------------------------------------------------------------------------- |
183: #pragma omp parallel for simd |
184: for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { |
185: for (int k = 0; k < depth; ++k) { |
186: field.energy1(j, y_max + 2 + k) = field.energy1(j, y_max + 1 - k); |
/scratch_na/users/xoserete/qaas_runs/171-415-4969/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 69 - 69 |
-------------------------------------------------------------------------------- |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x442010 PUSH %RBP |
0x442011 MOV %RSP,%RBP |
0x442014 PUSH %R14 |
0x442016 PUSH %R13 |
0x442018 MOV %RDI,%R13 |
0x44201b PUSH %R12 |
0x44201d PUSH %RBX |
0x44201e MOV 0x14(%RDI),%EBX |
0x442021 MOV 0x8(%RDI),%EDI |
0x442024 SUB %EBX,%EDI |
0x442026 LEA 0x1(%RDI),%R14D |
0x44202a CALL 4046c0 <omp_get_num_threads@plt> |
0x44202f MOV %EAX,%R12D |
0x442032 CALL 4045b0 <omp_get_thread_num@plt> |
0x442037 MOV %EAX,%ECX |
0x442039 MOV 0xc(%R13),%EAX |
0x44203d ADD %EBX,%EAX |
0x44203f ADD $0x2,%EAX |
0x442042 SUB %R14D,%EAX |
0x442045 CLTD |
0x442046 IDIV %R12D |
0x442049 CMP %EDX,%ECX |
0x44204b JL 44221f |
0x442051 IMUL %EAX,%ECX |
0x442054 ADD %ECX,%EDX |
0x442056 ADD %EDX,%EAX |
0x442058 CMP %EAX,%EDX |
0x44205a JGE 442216 |
0x442060 MOV 0x10(%R13),%R8D |
0x442064 MOV (%R13),%R10 |
0x442068 LEA (%R14,%RDX,1),%ESI |
0x44206c LEA (%R14,%RAX,1),%EDI |
0x442070 TEST %EBX,%EBX |
0x442072 JLE 442216 |
0x442078 MOV 0x48(%R10),%R14 |
0x44207c LEA 0x1(%R8),%R11D |
0x442080 MOV 0x58(%R10),%R9 |
0x442084 MOVSXD %ESI,%RSI |
0x442087 MOVSXD %R11D,%R8 |
0x44208a XOR %R10D,%R10D |
0x44208d IMUL %R14,%R8 |
0x442091 MOV %R14,%RDX |
0x442094 NEG %RDX |
0x442097 SAL $0x3,%RDX |
0x44209b ADD %R8,%R14 |
0x44209e XCHG %AX,%AX |
(272) 0x4420a0 LEA (%RSI,%R8,1),%R13 |
(272) 0x4420a4 LEA (%RSI,%R14,1),%RCX |
(272) 0x4420a8 MOV %EBX,%R11D |
(272) 0x4420ab LEA (%R9,%R13,8),%R12 |
(272) 0x4420af LEA (%R9,%RCX,8),%RAX |
(272) 0x4420b3 XOR %R13D,%R13D |
(272) 0x4420b6 AND $0x7,%R11D |
(272) 0x4420ba JE 442179 |
(272) 0x4420c0 CMP $0x1,%R11D |
(272) 0x4420c4 JE 44215d |
(272) 0x4420ca CMP $0x2,%R11D |
(272) 0x4420ce JE 44214a |
(272) 0x4420d0 CMP $0x3,%R11D |
(272) 0x4420d4 JE 442137 |
(272) 0x4420d6 CMP $0x4,%R11D |
(272) 0x4420da JE 442124 |
(272) 0x4420dc CMP $0x5,%R11D |
(272) 0x4420e0 JE 442111 |
(272) 0x4420e2 CMP $0x6,%R11D |
(272) 0x4420e6 JE 4420fe |
(272) 0x4420e8 VMOVSD (%R12),%XMM0 |
(272) 0x4420ee MOV $0x1,%R13D |
(272) 0x4420f4 ADD %RDX,%R12 |
(272) 0x4420f7 VMOVSD %XMM0,(%RAX) |
(272) 0x4420fb SUB %RDX,%RAX |
(272) 0x4420fe VMOVSD (%R12),%XMM1 |
(272) 0x442104 INC %R13D |
(272) 0x442107 ADD %RDX,%R12 |
(272) 0x44210a VMOVSD %XMM1,(%RAX) |
(272) 0x44210e SUB %RDX,%RAX |
(272) 0x442111 VMOVSD (%R12),%XMM2 |
(272) 0x442117 INC %R13D |
(272) 0x44211a ADD %RDX,%R12 |
(272) 0x44211d VMOVSD %XMM2,(%RAX) |
(272) 0x442121 SUB %RDX,%RAX |
(272) 0x442124 VMOVSD (%R12),%XMM3 |
(272) 0x44212a INC %R13D |
(272) 0x44212d ADD %RDX,%R12 |
(272) 0x442130 VMOVSD %XMM3,(%RAX) |
(272) 0x442134 SUB %RDX,%RAX |
(272) 0x442137 VMOVSD (%R12),%XMM4 |
(272) 0x44213d INC %R13D |
(272) 0x442140 ADD %RDX,%R12 |
(272) 0x442143 VMOVSD %XMM4,(%RAX) |
(272) 0x442147 SUB %RDX,%RAX |
(272) 0x44214a VMOVSD (%R12),%XMM5 |
(272) 0x442150 INC %R13D |
(272) 0x442153 ADD %RDX,%R12 |
(272) 0x442156 VMOVSD %XMM5,(%RAX) |
(272) 0x44215a SUB %RDX,%RAX |
(272) 0x44215d VMOVSD (%R12),%XMM6 |
(272) 0x442163 INC %R13D |
(272) 0x442166 ADD %RDX,%R12 |
(272) 0x442169 VMOVSD %XMM6,(%RAX) |
(272) 0x44216d SUB %RDX,%RAX |
(272) 0x442170 CMP %R13D,%EBX |
(272) 0x442173 JE 442206 |
(271) 0x442179 VMOVSD (%R12),%XMM7 |
(271) 0x44217f ADD %RDX,%R12 |
(271) 0x442182 ADD $0x8,%R13D |
(271) 0x442186 VMOVSD %XMM7,(%RAX) |
(271) 0x44218a SUB %RDX,%RAX |
(271) 0x44218d VMOVSD (%R12),%XMM8 |
(271) 0x442193 ADD %RDX,%R12 |
(271) 0x442196 VMOVSD %XMM8,(%RAX) |
(271) 0x44219a SUB %RDX,%RAX |
(271) 0x44219d VMOVSD (%R12),%XMM9 |
(271) 0x4421a3 ADD %RDX,%R12 |
(271) 0x4421a6 VMOVSD %XMM9,(%RAX) |
(271) 0x4421aa SUB %RDX,%RAX |
(271) 0x4421ad VMOVSD (%R12),%XMM10 |
(271) 0x4421b3 ADD %RDX,%R12 |
(271) 0x4421b6 VMOVSD %XMM10,(%RAX) |
(271) 0x4421ba SUB %RDX,%RAX |
(271) 0x4421bd VMOVSD (%R12),%XMM11 |
(271) 0x4421c3 ADD %RDX,%R12 |
(271) 0x4421c6 VMOVSD %XMM11,(%RAX) |
(271) 0x4421ca SUB %RDX,%RAX |
(271) 0x4421cd VMOVSD (%R12),%XMM12 |
(271) 0x4421d3 ADD %RDX,%R12 |
(271) 0x4421d6 VMOVSD %XMM12,(%RAX) |
(271) 0x4421da SUB %RDX,%RAX |
(271) 0x4421dd VMOVSD (%R12),%XMM13 |
(271) 0x4421e3 ADD %RDX,%R12 |
(271) 0x4421e6 VMOVSD %XMM13,(%RAX) |
(271) 0x4421ea SUB %RDX,%RAX |
(271) 0x4421ed VMOVSD (%R12),%XMM14 |
(271) 0x4421f3 ADD %RDX,%R12 |
(271) 0x4421f6 VMOVSD %XMM14,(%RAX) |
(271) 0x4421fa SUB %RDX,%RAX |
(271) 0x4421fd CMP %R13D,%EBX |
(271) 0x442200 JNE 442179 |
(272) 0x442206 INC %RSI |
(272) 0x442209 LEA (%R10,%RSI,1),%R12D |
(272) 0x44220d CMP %R12D,%EDI |
(272) 0x442210 JG 4420a0 |
0x442216 POP %RBX |
0x442217 POP %R12 |
0x442219 POP %R13 |
0x44221b POP %R14 |
0x44221d POP %RBP |
0x44221e RET |
0x44221f INC %EAX |
0x442221 XOR %EDX,%EDX |
0x442223 JMP 442051 |
0x442228 NOPL (%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
○95.01 | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
Path / |
Source file and lines | update_halo.cpp:183-186 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 170 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.80 | 5.80 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
cycles | 5.80 | 8.87 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 15.07-16.47 |
Stall cycles | 4.96-6.37 |
LM full (events) | 6.49-7.92 |
Front-end | 10.17 |
Dispatch | 8.87 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%EDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%RDI),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 44221f <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x20f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 442216 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x10(%R13),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RDX,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA (%R14,%RAX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 442216 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x48(%R10),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R8),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x58(%R10),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %ESI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOVSXD %R11D,%R8 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R10D,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %R14,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R8,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 442051 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x41> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | update_halo.cpp:183-186 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 170 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.80 | 5.80 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
cycles | 5.80 | 8.87 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 15.07-16.47 |
Stall cycles | 4.96-6.37 |
LM full (events) | 6.49-7.92 |
Front-end | 10.17 |
Dispatch | 8.87 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%EDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%RDI),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 44221f <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x20f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 442216 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x10(%R13),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RDX,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA (%R14,%RAX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 442216 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x48(%R10),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R8),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x58(%R10),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %ESI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOVSXD %R11D,%R8 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R10D,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %R14,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R8,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 442051 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x41> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼update_halo_kernel(int, int, int, int, std::array | 0.01 | 0 |
▼Loop 272 - update_halo.cpp:185-186 - exec– | 0 | 0.01 |
○Loop 271 - update_halo.cpp:185-186 - exec | 0 | 0 |