Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> c ... | Module: exec | Source: update_halo.cpp:97-100 [...] | Coverage: 0.01% |
---|
Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> c ... | Module: exec | Source: update_halo.cpp:97-100 [...] | Coverage: 0.01% |
---|
/scratch_na/users/xoserete/qaas_runs/171-415-4969/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/update_halo.cpp: 97 - 100 |
-------------------------------------------------------------------------------- |
97: #pragma omp parallel for simd |
98: for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { |
99: for (int k = 0; k < depth; ++k) { |
100: field.density1(j, y_max + 2 + k) = field.density1(j, y_max + 1 - k); |
/scratch_na/users/xoserete/qaas_runs/171-415-4969/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 69 - 69 |
-------------------------------------------------------------------------------- |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x440330 PUSH %RBP |
0x440331 MOV %RSP,%RBP |
0x440334 PUSH %R14 |
0x440336 PUSH %R13 |
0x440338 MOV %RDI,%R13 |
0x44033b PUSH %R12 |
0x44033d PUSH %RBX |
0x44033e MOV 0x14(%RDI),%EBX |
0x440341 MOV 0x8(%RDI),%EDI |
0x440344 SUB %EBX,%EDI |
0x440346 LEA 0x1(%RDI),%R14D |
0x44034a CALL 4046c0 <omp_get_num_threads@plt> |
0x44034f MOV %EAX,%R12D |
0x440352 CALL 4045b0 <omp_get_thread_num@plt> |
0x440357 MOV %EAX,%ECX |
0x440359 MOV 0xc(%R13),%EAX |
0x44035d ADD %EBX,%EAX |
0x44035f ADD $0x2,%EAX |
0x440362 SUB %R14D,%EAX |
0x440365 CLTD |
0x440366 IDIV %R12D |
0x440369 CMP %EDX,%ECX |
0x44036b JL 44053f |
0x440371 IMUL %EAX,%ECX |
0x440374 ADD %ECX,%EDX |
0x440376 ADD %EDX,%EAX |
0x440378 CMP %EAX,%EDX |
0x44037a JGE 440536 |
0x440380 MOV 0x10(%R13),%R8D |
0x440384 MOV (%R13),%R10 |
0x440388 LEA (%R14,%RDX,1),%ESI |
0x44038c LEA (%R14,%RAX,1),%EDI |
0x440390 TEST %EBX,%EBX |
0x440392 JLE 440536 |
0x440398 MOV 0x18(%R10),%R14 |
0x44039c LEA 0x1(%R8),%R11D |
0x4403a0 MOV 0x28(%R10),%R9 |
0x4403a4 MOVSXD %ESI,%RSI |
0x4403a7 MOVSXD %R11D,%R8 |
0x4403aa XOR %R10D,%R10D |
0x4403ad IMUL %R14,%R8 |
0x4403b1 MOV %R14,%RDX |
0x4403b4 NEG %RDX |
0x4403b7 SAL $0x3,%RDX |
0x4403bb ADD %R8,%R14 |
0x4403be XCHG %AX,%AX |
(248) 0x4403c0 LEA (%RSI,%R8,1),%R13 |
(248) 0x4403c4 LEA (%RSI,%R14,1),%RCX |
(248) 0x4403c8 MOV %EBX,%R11D |
(248) 0x4403cb LEA (%R9,%R13,8),%R12 |
(248) 0x4403cf LEA (%R9,%RCX,8),%RAX |
(248) 0x4403d3 XOR %R13D,%R13D |
(248) 0x4403d6 AND $0x7,%R11D |
(248) 0x4403da JE 440499 |
(248) 0x4403e0 CMP $0x1,%R11D |
(248) 0x4403e4 JE 44047d |
(248) 0x4403ea CMP $0x2,%R11D |
(248) 0x4403ee JE 44046a |
(248) 0x4403f0 CMP $0x3,%R11D |
(248) 0x4403f4 JE 440457 |
(248) 0x4403f6 CMP $0x4,%R11D |
(248) 0x4403fa JE 440444 |
(248) 0x4403fc CMP $0x5,%R11D |
(248) 0x440400 JE 440431 |
(248) 0x440402 CMP $0x6,%R11D |
(248) 0x440406 JE 44041e |
(248) 0x440408 VMOVSD (%R12),%XMM0 |
(248) 0x44040e MOV $0x1,%R13D |
(248) 0x440414 ADD %RDX,%R12 |
(248) 0x440417 VMOVSD %XMM0,(%RAX) |
(248) 0x44041b SUB %RDX,%RAX |
(248) 0x44041e VMOVSD (%R12),%XMM1 |
(248) 0x440424 INC %R13D |
(248) 0x440427 ADD %RDX,%R12 |
(248) 0x44042a VMOVSD %XMM1,(%RAX) |
(248) 0x44042e SUB %RDX,%RAX |
(248) 0x440431 VMOVSD (%R12),%XMM2 |
(248) 0x440437 INC %R13D |
(248) 0x44043a ADD %RDX,%R12 |
(248) 0x44043d VMOVSD %XMM2,(%RAX) |
(248) 0x440441 SUB %RDX,%RAX |
(248) 0x440444 VMOVSD (%R12),%XMM3 |
(248) 0x44044a INC %R13D |
(248) 0x44044d ADD %RDX,%R12 |
(248) 0x440450 VMOVSD %XMM3,(%RAX) |
(248) 0x440454 SUB %RDX,%RAX |
(248) 0x440457 VMOVSD (%R12),%XMM4 |
(248) 0x44045d INC %R13D |
(248) 0x440460 ADD %RDX,%R12 |
(248) 0x440463 VMOVSD %XMM4,(%RAX) |
(248) 0x440467 SUB %RDX,%RAX |
(248) 0x44046a VMOVSD (%R12),%XMM5 |
(248) 0x440470 INC %R13D |
(248) 0x440473 ADD %RDX,%R12 |
(248) 0x440476 VMOVSD %XMM5,(%RAX) |
(248) 0x44047a SUB %RDX,%RAX |
(248) 0x44047d VMOVSD (%R12),%XMM6 |
(248) 0x440483 INC %R13D |
(248) 0x440486 ADD %RDX,%R12 |
(248) 0x440489 VMOVSD %XMM6,(%RAX) |
(248) 0x44048d SUB %RDX,%RAX |
(248) 0x440490 CMP %R13D,%EBX |
(248) 0x440493 JE 440526 |
(247) 0x440499 VMOVSD (%R12),%XMM7 |
(247) 0x44049f ADD %RDX,%R12 |
(247) 0x4404a2 ADD $0x8,%R13D |
(247) 0x4404a6 VMOVSD %XMM7,(%RAX) |
(247) 0x4404aa SUB %RDX,%RAX |
(247) 0x4404ad VMOVSD (%R12),%XMM8 |
(247) 0x4404b3 ADD %RDX,%R12 |
(247) 0x4404b6 VMOVSD %XMM8,(%RAX) |
(247) 0x4404ba SUB %RDX,%RAX |
(247) 0x4404bd VMOVSD (%R12),%XMM9 |
(247) 0x4404c3 ADD %RDX,%R12 |
(247) 0x4404c6 VMOVSD %XMM9,(%RAX) |
(247) 0x4404ca SUB %RDX,%RAX |
(247) 0x4404cd VMOVSD (%R12),%XMM10 |
(247) 0x4404d3 ADD %RDX,%R12 |
(247) 0x4404d6 VMOVSD %XMM10,(%RAX) |
(247) 0x4404da SUB %RDX,%RAX |
(247) 0x4404dd VMOVSD (%R12),%XMM11 |
(247) 0x4404e3 ADD %RDX,%R12 |
(247) 0x4404e6 VMOVSD %XMM11,(%RAX) |
(247) 0x4404ea SUB %RDX,%RAX |
(247) 0x4404ed VMOVSD (%R12),%XMM12 |
(247) 0x4404f3 ADD %RDX,%R12 |
(247) 0x4404f6 VMOVSD %XMM12,(%RAX) |
(247) 0x4404fa SUB %RDX,%RAX |
(247) 0x4404fd VMOVSD (%R12),%XMM13 |
(247) 0x440503 ADD %RDX,%R12 |
(247) 0x440506 VMOVSD %XMM13,(%RAX) |
(247) 0x44050a SUB %RDX,%RAX |
(247) 0x44050d VMOVSD (%R12),%XMM14 |
(247) 0x440513 ADD %RDX,%R12 |
(247) 0x440516 VMOVSD %XMM14,(%RAX) |
(247) 0x44051a SUB %RDX,%RAX |
(247) 0x44051d CMP %R13D,%EBX |
(247) 0x440520 JNE 440499 |
(248) 0x440526 INC %RSI |
(248) 0x440529 LEA (%R10,%RSI,1),%R12D |
(248) 0x44052d CMP %R12D,%EDI |
(248) 0x440530 JG 4403c0 |
0x440536 POP %RBX |
0x440537 POP %R12 |
0x440539 POP %R13 |
0x44053b POP %R14 |
0x44053d POP %RBP |
0x44053e RET |
0x44053f INC %EAX |
0x440541 XOR %EDX,%EDX |
0x440543 JMP 440371 |
0x440548 NOPL (%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
○100.00 | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
Path / |
Source file and lines | update_halo.cpp:97-100 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 170 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.80 | 5.80 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
cycles | 5.80 | 8.87 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 15.07-16.47 |
Stall cycles | 4.96-6.37 |
LM full (events) | 6.49-7.92 |
Front-end | 10.17 |
Dispatch | 8.87 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%EDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%RDI),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 44053f <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x20f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 440536 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x10(%R13),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RDX,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA (%R14,%RAX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 440536 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x18(%R10),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R8),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x28(%R10),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %ESI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOVSXD %R11D,%R8 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R10D,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %R14,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R8,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 440371 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x41> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | update_halo.cpp:97-100 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 170 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.80 | 5.80 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
cycles | 5.80 | 8.87 | 4.33 | 4.33 | 3.50 | 5.80 | 5.80 | 3.50 | 3.50 | 3.50 | 5.80 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 15.07-16.47 |
Stall cycles | 4.96-6.37 |
LM full (events) | 6.49-7.92 |
Front-end | 10.17 |
Dispatch | 8.87 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%EDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%RDI),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 44053f <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x20f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 440536 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x10(%R13),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RDX,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA (%R14,%RAX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 440536 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x206> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x18(%R10),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R8),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x28(%R10),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %ESI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOVSXD %R11D,%R8 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
XOR %R10D,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %R14,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R8,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 440371 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x41> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼update_halo_kernel(int, int, int, int, std::array | 0.01 | 0 |
▼Loop 248 - update_halo.cpp:99-100 - exec– | 0 | 0.01 |
○Loop 247 - update_halo.cpp:99-100 - exec | 0 | 0 |