Function: clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100 | Module: exec | Source: pack_kernel.f90:155-163 | Coverage: 0.04% |
---|
Function: clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100 | Module: exec | Source: pack_kernel.f90:155-163 | Coverage: 0.04% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-137-7698/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/pack_kernel.f90: 155 - 163 |
-------------------------------------------------------------------------------- |
155: !$OMP PARALLEL DO PRIVATE(index) |
156: DO k=y_min-depth,y_max+y_inc+depth |
157: !$OMP SIMD |
158: DO j=1,depth |
159: index= buffer_offset + j+(k+depth-1)*depth |
160: right_snd_buffer(index)=field(x_max+1-j,k) |
161: ENDDO |
162: ENDDO |
163: !$OMP END PARALLEL DO |
0x4493c0 PUSH %RBP |
0x4493c1 MOV %RSP,%RBP |
0x4493c4 PUSH %R15 |
0x4493c6 PUSH %R14 |
0x4493c8 PUSH %R13 |
0x4493ca PUSH %R12 |
0x4493cc PUSH %RBX |
0x4493cd SUB $0x58,%RSP |
0x4493d1 MOV %R8,-0x58(%RBP) |
0x4493d5 MOV %RCX,-0x70(%RBP) |
0x4493d9 MOV 0x28(%RBP),%EAX |
0x4493dc MOVL $0,-0x4c(%RBP) |
0x4493e3 TEST %EAX,%EAX |
0x4493e5 JS 449446 |
0x4493e7 MOV %RDX,%RBX |
0x4493ea MOV (%RDI),%ESI |
0x4493ec MOVL $0,-0x34(%RBP) |
0x4493f3 MOV %EAX,-0x30(%RBP) |
0x4493f6 MOVL $0x1,-0x48(%RBP) |
0x4493fd SUB $0x8,%RSP |
0x449401 LEA -0x48(%RBP),%RAX |
0x449405 LEA -0x4c(%RBP),%RCX |
0x449409 LEA -0x34(%RBP),%R8 |
0x44940d LEA -0x30(%RBP),%R9 |
0x449411 MOV $0x54e590,%EDI |
0x449416 MOV %ESI,-0x44(%RBP) |
0x449419 MOV $0x22,%EDX |
0x44941e PUSH $0x1 |
0x449420 PUSH $0x1 |
0x449422 PUSH %RAX |
0x449423 CALL 404670 <__kmpc_for_static_init_4@plt> |
0x449428 ADD $0x20,%RSP |
0x44942c MOV -0x34(%RBP),%EAX |
0x44942f MOV -0x30(%RBP),%EDX |
0x449432 SUB %EAX,%EDX |
0x449434 JAE 449480 |
0x449436 MOV $0x54e5b0,%EDI |
0x44943b MOV -0x44(%RBP),%ESI |
0x44943e VZEROUPPER |
0x449441 CALL 404230 <__kmpc_for_static_fini@plt> |
0x449446 ADD $0x58,%RSP |
0x44944a POP %RBX |
0x44944b POP %R12 |
0x44944d POP %R13 |
0x44944f POP %R14 |
0x449451 POP %R15 |
0x449453 POP %RBP |
0x449454 RET |
0x449455 NOPW %CS:(%RAX,%RAX,1) |
0x449464 NOPW %CS:(%RAX,%RAX,1) |
0x449473 NOPW %CS:(%RAX,%RAX,1) |
0x449480 MOV %RAX,%RCX |
0x449483 MOV -0x58(%RBP),%RAX |
0x449487 MOV (%RAX),%ESI |
0x449489 LEA -0x1(%RCX,%RBX,1),%EDI |
0x44948d XOR %R8D,%R8D |
0x449490 ADD %EBX,%ECX |
0x449492 MOV %RCX,-0x68(%RBP) |
0x449496 VMOVDQA64 0xc15e0(%RIP),%ZMM0 |
0x4494a0 VMOVDQA 0xc24d8(%RIP),%YMM1 |
0x4494a8 VPTERNLOGD $-0x1,%ZMM2,%ZMM2,%ZMM2 |
0x4494af VMOVDQA64 0xc15c7(%RIP),%ZMM3 |
0x4494b9 MOV %EDX,-0x2c(%RBP) |
0x4494bc JMP 4494d8 |
0x4494be XCHG %AX,%AX |
(440) 0x4494c0 MOV %ESI,%R10D |
(440) 0x4494c3 LEA 0x1(%R8),%EAX |
(440) 0x4494c7 INC %EDI |
(440) 0x4494c9 MOV %R10D,%ESI |
(440) 0x4494cc CMP %EDX,%R8D |
(440) 0x4494cf MOV %EAX,%R8D |
(440) 0x4494d2 JE 449436 |
(440) 0x4494d8 TEST %ESI,%ESI |
(440) 0x4494da JLE 4494c0 |
(440) 0x4494dc MOV -0x68(%RBP),%RAX |
(440) 0x4494e0 LEA (%RAX,%R8,1),%R12D |
(440) 0x4494e4 MOV -0x70(%RBP),%RAX |
(440) 0x4494e8 MOVSXD (%RAX),%RAX |
(440) 0x4494eb MOV %RAX,-0x40(%RBP) |
(440) 0x4494ef MOV -0x58(%RBP),%RCX |
(440) 0x4494f3 MOV (%RCX),%R10D |
(440) 0x4494f6 MOV 0x10(%RBP),%R9 |
(440) 0x4494fa MOV (%R9),%RDX |
(440) 0x4494fd MOV 0x38(%R9),%R13 |
(440) 0x449501 MOV 0x18(%RBP),%RCX |
(440) 0x449505 MOV (%RCX),%EAX |
(440) 0x449507 MOV 0x50(%R9),%R9 |
(440) 0x44950b MOV 0x10a7ee(%RIP),%R15 |
(440) 0x449512 MOV 0x10a81f(%RIP),%RCX |
(440) 0x449519 MOV %ESI,%R14D |
(440) 0x44951c MOV %R14,%R11 |
(440) 0x44951f MOVSXD %R12D,%RBX |
(440) 0x449522 MOV $-0x8,%ESI |
(440) 0x449527 AND %RSI,%R11 |
(440) 0x44952a JE 449600 |
(440) 0x449530 LEA (%R10,%RDI,1),%ESI |
(440) 0x449534 IMUL %R10D,%ESI |
(440) 0x449538 MOVSXD %ESI,%RSI |
(440) 0x44953b MOV %RCX,-0x78(%RBP) |
(440) 0x44953f MOV -0x40(%RBP),%RCX |
(440) 0x449543 ADD %RCX,%RSI |
(440) 0x449546 VPBROADCASTQ %R13,%ZMM5 |
(440) 0x44954c LEA -0x1(%R12,%R10,1),%R12D |
(440) 0x449551 MOV %R10,-0x60(%RBP) |
(440) 0x449555 IMUL %R10D,%R12D |
(440) 0x449559 MOVSXD %R12D,%R12 |
(440) 0x44955c ADD %RCX,%R12 |
(440) 0x44955f MOV -0x78(%RBP),%RCX |
(440) 0x449563 VPBROADCASTQ %RCX,%ZMM6 |
(440) 0x449569 MOV %EAX,-0x40(%RBP) |
(440) 0x44956c XOR %ECX,%ECX |
(440) 0x44956e XCHG %AX,%AX |
(441) 0x449570 LEA 0x1(%RBX),%R13 |
(441) 0x449574 IMUL %R9,%R13 |
(441) 0x449578 VPBROADCASTD %EAX,%YMM7 |
(441) 0x44957e VPADDD %YMM1,%YMM7,%YMM7 |
(441) 0x449582 VPMOVSXDQ %YMM7,%ZMM7 |
(441) 0x449588 VPSUBQ %ZMM2,%ZMM7,%ZMM7 |
(441) 0x44958e VPMULLQ %ZMM7,%ZMM5,%ZMM7 |
(441) 0x449594 LEA (%RDX,%R13,1),%R10 |
(441) 0x449598 KXNORW %K0,%K0,%K1 |
(441) 0x44959c VXORPD %XMM8,%XMM8,%XMM8 |
(441) 0x4495a1 VGATHERQPD (%R10,%ZMM7,1),%ZMM8{%K1} |
(441) 0x4495a8 LEA (%RSI,%RCX,1),%R10 |
(441) 0x4495ac VPBROADCASTQ %R10,%ZMM7 |
(441) 0x4495b2 VPADDQ %ZMM3,%ZMM7,%ZMM7 |
(441) 0x4495b8 VPMULLQ %ZMM7,%ZMM6,%ZMM7 |
(441) 0x4495be KXNORW %K0,%K0,%K1 |
(441) 0x4495c2 VSCATTERQPD %ZMM8,(%R15,%ZMM7,1){%K1} |
(441) 0x4495c9 ADD $0x8,%RCX |
(441) 0x4495cd ADD $-0x8,%EAX |
(441) 0x4495d0 CMP %R11,%RCX |
(441) 0x4495d3 JB 449570 |
(440) 0x4495d5 CMP %R14,%R11 |
(440) 0x4495d8 JNE 449680 |
(440) 0x4495de MOV -0x2c(%RBP),%EDX |
(440) 0x4495e1 MOV -0x60(%RBP),%R10 |
(440) 0x4495e5 JMP 4494c3 |
0x4495ea NOPW %CS:(%RAX,%RAX,1) |
0x4495f9 NOPL (%RAX) |
(440) 0x449600 VPBROADCASTQ %R14,%ZMM8 |
(440) 0x449606 VPBROADCASTQ %RDX,%ZMM9 |
(440) 0x44960c INC %RBX |
(440) 0x44960f IMUL %RBX,%R9 |
(440) 0x449613 VPBROADCASTQ %R9,%ZMM11 |
(440) 0x449619 VPBROADCASTD %EAX,%YMM10 |
(440) 0x44961f VPBROADCASTQ %R13,%ZMM5 |
(440) 0x449625 VPBROADCASTQ %R15,%ZMM7 |
(440) 0x44962b LEA -0x1(%R12,%R10,1),%EDX |
(440) 0x449630 IMUL %R10D,%EDX |
(440) 0x449634 MOVSXD %EDX,%R12 |
(440) 0x449637 ADD -0x40(%RBP),%R12 |
(440) 0x44963b VPBROADCASTQ %RCX,%ZMM6 |
(440) 0x449641 XOR %R11D,%R11D |
(440) 0x449644 MOV -0x2c(%RBP),%EDX |
(440) 0x449647 JMP 4496a8 |
0x449649 NOPW %CS:(%RAX,%RAX,1) |
0x449658 NOPW %CS:(%RAX,%RAX,1) |
0x449667 NOPW %CS:(%RAX,%RAX,1) |
0x449676 NOPW %CS:(%RAX,%RAX,1) |
(440) 0x449680 VPBROADCASTQ %R13,%ZMM11 |
(440) 0x449686 VPBROADCASTQ %R15,%ZMM7 |
(440) 0x44968c VPBROADCASTQ %RDX,%ZMM9 |
(440) 0x449692 MOV -0x40(%RBP),%EAX |
(440) 0x449695 VPBROADCASTD %EAX,%YMM10 |
(440) 0x44969b VPBROADCASTQ %R14,%ZMM8 |
(440) 0x4496a1 MOV -0x2c(%RBP),%EDX |
(440) 0x4496a4 MOV -0x60(%RBP),%R10 |
(440) 0x4496a8 VPADDQ %ZMM11,%ZMM9,%ZMM9 |
(440) 0x4496ae VPBROADCASTQ %R11,%ZMM11 |
(440) 0x4496b4 VPSUBQ %ZMM11,%ZMM8,%ZMM8 |
(440) 0x4496ba VPCMPNLEUQ %ZMM0,%ZMM8,%K1 |
(440) 0x4496c1 VPBROADCASTD %R11D,%YMM8 |
(440) 0x4496c7 VPSUBD %YMM8,%YMM10,%YMM8 |
(440) 0x4496cc VPADDD %YMM1,%YMM8,%YMM8 |
(440) 0x4496d0 VPMOVSXDQ %YMM8,%ZMM8 |
(440) 0x4496d6 VPSUBQ %ZMM2,%ZMM8,%ZMM8 |
(440) 0x4496dc VPMULLQ %ZMM8,%ZMM5,%ZMM5 |
(440) 0x4496e2 VPADDQ %ZMM5,%ZMM9,%ZMM5 |
(440) 0x4496e8 VPXOR %XMM8,%XMM8,%XMM8 |
(440) 0x4496ed KMOVQ %K1,%K2 |
(440) 0x4496f2 VGATHERQPD (,%ZMM5,1),%ZMM8{%K2} |
(440) 0x4496fd VMOVAPD %ZMM8,%ZMM4{%K1} |
(440) 0x449703 ADD %R11,%R12 |
(440) 0x449706 VPBROADCASTQ %R12,%ZMM5 |
(440) 0x44970c VPADDQ %ZMM0,%ZMM5,%ZMM5 |
(440) 0x449712 VPMULLQ %ZMM5,%ZMM6,%ZMM5 |
(440) 0x449718 VPADDQ %ZMM5,%ZMM7,%ZMM5 |
(440) 0x44971e VSCATTERQPD %ZMM4,(,%ZMM5,1){%K1} |
(440) 0x449729 JMP 4494c3 |
0x44972e NOPW %CS:(%RAX,%RAX,1) |
0x449738 NOPL (%RAX,%RAX,1) |
Path / |
Source file and lines | pack_kernel.f90:155-163 |
Module | exec |
nb instructions | 73 |
nb uops | 76 |
loop length | 351 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 3 |
nb stack references | 10 |
micro-operation queue | 12.67 cycles |
front end | 12.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.30 | 2.20 | 5.67 | 5.67 | 10.00 | 2.20 | 2.10 | 10.00 | 10.00 | 10.00 | 2.20 | 5.67 |
cycles | 2.30 | 2.20 | 5.67 | 5.67 | 10.00 | 2.20 | 2.10 | 10.00 | 10.00 | 10.00 | 2.20 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.29-12.35 |
Stall cycles | 0.00 |
Front-end | 12.67 |
Dispatch | 10.00 |
Overall L1 | 12.67 |
Vectorization ratio |
all | 19% |
load | 42% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 40% |
Vector efficiency ratio |
all | 21% |
load | 39% |
store | 8% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 11% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 28% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x58,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R8,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x4c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
TEST %EAX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JS 449446 <pack_kernel_module_mp_clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100+0x86> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RDX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x34(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %EAX,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0x1,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0x48(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x4c(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x34(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x54e590,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x44(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 404670 <__kmpc_for_static_init_4@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x34(%RBP),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 449480 <pack_kernel_module_mp_clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100+0xc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x54e5b0,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x44(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CALL 404230 <__kmpc_for_static_fini@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x58,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RCX,%RBX,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %EBX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVDQA64 0xc15e0(%RIP),%ZMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVDQA 0xc24d8(%RIP),%YMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VPTERNLOGD $-0x1,%ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVDQA64 0xc15c7(%RIP),%ZMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
MOV %EDX,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 4494d8 <pack_kernel_module_mp_clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100+0x118> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | pack_kernel.f90:155-163 |
Module | exec |
nb instructions | 73 |
nb uops | 76 |
loop length | 351 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 3 |
nb stack references | 10 |
micro-operation queue | 12.67 cycles |
front end | 12.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.30 | 2.20 | 5.67 | 5.67 | 10.00 | 2.20 | 2.10 | 10.00 | 10.00 | 10.00 | 2.20 | 5.67 |
cycles | 2.30 | 2.20 | 5.67 | 5.67 | 10.00 | 2.20 | 2.10 | 10.00 | 10.00 | 10.00 | 2.20 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.29-12.35 |
Stall cycles | 0.00 |
Front-end | 12.67 |
Dispatch | 10.00 |
Overall L1 | 12.67 |
Vectorization ratio |
all | 19% |
load | 42% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 40% |
Vector efficiency ratio |
all | 21% |
load | 39% |
store | 8% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 11% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 28% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x58,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R8,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x4c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
TEST %EAX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JS 449446 <pack_kernel_module_mp_clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100+0x86> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RDX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x34(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %EAX,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0x1,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0x48(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x4c(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x34(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x54e590,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x44(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 404670 <__kmpc_for_static_init_4@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x34(%RBP),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x30(%RBP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JAE 449480 <pack_kernel_module_mp_clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100+0xc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x54e5b0,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x44(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CALL 404230 <__kmpc_for_static_fini@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x58,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RAX),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RCX,%RBX,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %EBX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVDQA64 0xc15e0(%RIP),%ZMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVDQA 0xc24d8(%RIP),%YMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VPTERNLOGD $-0x1,%ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVDQA64 0xc15c7(%RIP),%ZMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
MOV %EDX,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 4494d8 <pack_kernel_module_mp_clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100+0x118> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼clover_pack_message_right_.DIR.OMP.PARALLEL.LOOP.2.split100– | 0.04 | 0.01 |
▼Loop 440 - pack_kernel.f90:156-160 - exec– | 0.04 | 0.03 |
○Loop 441 - pack_kernel.f90:158-160 - exec | 0 | 0 |