Loop Id: 244 | Module: exec | Source: advec_mom_kernel.f90:81-177 [...] | Coverage: 0.01% |
---|
Loop Id: 244 | Module: exec | Source: advec_mom_kernel.f90:81-177 [...] | Coverage: 0.01% |
---|
0x4334a0 MOV -0x40(%RBP),%RCX |
0x4334a4 LEA 0x1(%RCX),%RAX |
0x4334a8 INCQ -0x90(%RBP) |
0x4334af MOV -0x60(%RBP),%RDX |
0x4334b3 MOV -0x88(%RBP),%R10 |
0x4334ba ADD %RDX,%R10 |
0x4334bd ADD %RDX,-0xa8(%RBP) |
0x4334c4 CMP -0x70(%RBP),%RCX |
0x4334c8 MOV %RAX,-0x40(%RBP) |
0x4334cc JE 432b3e |
0x4334d2 MOV %R10,-0x88(%RBP) |
0x4334d9 MOV -0x80(%RBP),%EAX |
0x4334dc CMP -0x78(%RBP),%EAX |
0x4334df MOV -0x58(%RBP),%RCX |
0x4334e3 JL 4334a0 |
0x4334e5 MOV 0x108(%RBP),%RAX |
0x4334ec MOV (%RAX),%R14 |
0x4334ef MOV 0x130(%RBP),%RAX |
0x4334f6 MOV (%RAX),%R9 |
0x4334f9 MOV -0xc0(%RBP),%RDX |
0x433500 AND $-0x2,%RDX |
0x433504 MOV %R9,-0x50(%RBP) |
0x433508 JE 4338af |
0x43350e LEA -0x1(%RDX),%RDI |
0x433512 MOV -0x98(%RBP),%RAX |
0x433519 MOV -0x40(%RBP),%R8 |
0x43351d ADD %R8,%RAX |
0x433520 SUB -0xa0(%RBP),%RAX |
0x433527 MOV %R9,%R8 |
0x43352a IMUL %RAX,%R8 |
0x43352e MOVQ %R8,%XMM0 |
0x433533 MOV %R14,%R12 |
0x433536 IMUL -0x90(%RBP),%R12 |
0x43353e ADD -0xe0(%RBP),%R12 |
0x433545 MOVQ 0x40(%RBP),%XMM2 |
0x43354a PADDQ %XMM2,%XMM0 |
0x43354e PSHUFD $0x44,%XMM0,%XMM15 |
0x433554 MOVQ %RSI,%XMM0 |
0x433559 PSHUFD $0x44,%XMM0,%XMM4 |
0x43355e MOVQ 0x88(%RBP),%XMM0 |
0x433566 IMUL -0x48(%RBP),%RAX |
0x43356b MOVQ %RAX,%XMM2 |
0x433570 PADDQ %XMM0,%XMM2 |
0x433574 PSHUFD $0x44,%XMM2,%XMM2 |
0x433579 XOR %R9D,%R9D |
0x43357c MOV -0xd8(%RBP),%R11 |
0x433583 MOV -0x88(%RBP),%RBX |
0x43358a JMP 43362c |
(246) 0x433590 DIVPD %XMM3,%XMM0 |
(246) 0x433594 ADDPD %XMM6,%XMM0 |
(246) 0x433598 MULPD 0xfa060(%RIP),%XMM5 |
(246) 0x4335a0 MULPD %XMM0,%XMM5 |
(246) 0x4335a4 MOVAPD %XMM9,%XMM0 |
(246) 0x4335a9 CMPPD $0x2,%XMM14,%XMM0 |
(246) 0x4335af ANDPD %XMM0,%XMM9 |
(246) 0x4335b4 ANDNPD %XMM14,%XMM0 |
(246) 0x4335b9 ORPD %XMM9,%XMM0 |
(246) 0x4335be MOVAPD %XMM5,%XMM3 |
(246) 0x4335c2 CMPPD $0x2,%XMM0,%XMM3 |
(246) 0x4335c7 ANDPD %XMM3,%XMM5 |
(246) 0x4335cb ANDNPD %XMM0,%XMM3 |
(246) 0x4335cf ORPD %XMM5,%XMM3 |
(246) 0x4335d3 XORPD %XMM14,%XMM14 |
(246) 0x4335d8 CMPPD $0x2,%XMM14,%XMM12 |
(246) 0x4335de MOVAPD %XMM12,%XMM0 |
(246) 0x4335e3 ANDNPD %XMM3,%XMM0 |
(246) 0x4335e7 XORPD 0x12f841(%RIP),%XMM3 |
(246) 0x4335ef ANDPD %XMM12,%XMM3 |
(246) 0x4335f4 ORPD %XMM0,%XMM3 |
(246) 0x4335f8 ANDPD %XMM3,%XMM10 |
(246) 0x4335fd MOVAPD 0xf9fdb(%RIP),%XMM0 |
(246) 0x433605 SUBPD %XMM11,%XMM0 |
(246) 0x43360a MULPD %XMM10,%XMM0 |
(246) 0x43360f ADDPD %XMM13,%XMM0 |
(246) 0x433614 MULPD %XMM8,%XMM0 |
(246) 0x433619 MOVUPD %XMM0,(%RBX,%R9,8) |
(246) 0x43361f ADD $0x2,%R9 |
(246) 0x433623 CMP %RDI,%R9 |
(246) 0x433626 JA 433880 |
(246) 0x43362c MOVUPD (%R12,%R9,8),%XMM8 |
(246) 0x433632 MOVAPD %XMM8,%XMM7 |
(246) 0x433637 CMPPD $0x1,%XMM14,%XMM7 |
(246) 0x43363d MOV -0x30(%RBP),%RAX |
(246) 0x433641 ADD %R9D,%EAX |
(246) 0x433644 MOVD %EAX,%XMM0 |
(246) 0x433648 PSHUFD $0x50,%XMM0,%XMM9 |
(246) 0x43364e MOVDQA %XMM9,%XMM11 |
(246) 0x433653 PADDD 0xf9f54(%RIP),%XMM11 |
(246) 0x43365c MOVDQA %XMM9,%XMM14 |
(246) 0x433661 PADDD 0xf9f66(%RIP),%XMM14 |
(246) 0x43366a MOVDQA %XMM9,%XMM5 |
(246) 0x43366f PADDD 0xf9f49(%RIP),%XMM5 |
(246) 0x433677 PADDD 0xf9f20(%RIP),%XMM9 |
(246) 0x433680 PXOR %XMM0,%XMM0 |
(246) 0x433684 PCMPGTD %XMM5,%XMM0 |
(246) 0x433688 PUNPCKLDQ %XMM0,%XMM5 |
(246) 0x43368c MOVAPD %XMM7,%XMM10 |
(246) 0x433691 ANDNPD %XMM5,%XMM10 |
(246) 0x433696 XORPD %XMM3,%XMM3 |
(246) 0x43369a PCMPGTD %XMM11,%XMM3 |
(246) 0x43369f MOVDQA %XMM11,%XMM0 |
(246) 0x4336a4 PUNPCKLDQ %XMM3,%XMM0 |
(246) 0x4336a8 MOVAPD %XMM7,%XMM3 |
(246) 0x4336ac ANDNPD %XMM0,%XMM3 |
(246) 0x4336b0 ANDPD %XMM7,%XMM0 |
(246) 0x4336b4 ORPD %XMM10,%XMM0 |
(246) 0x4336b9 PSUBQ %XMM4,%XMM0 |
(246) 0x4336bd PSLLQ $0x3,%XMM0 |
(246) 0x4336c2 MOVDQA %XMM2,%XMM6 |
(246) 0x4336c6 PADDQ %XMM0,%XMM6 |
(246) 0x4336ca MOVQ %XMM6,%RAX |
(246) 0x4336cf PSHUFD $-0x12,%XMM6,%XMM6 |
(246) 0x4336d4 MOVQ %XMM6,%R8 |
(246) 0x4336d9 PXOR %XMM6,%XMM6 |
(246) 0x4336dd PCMPGTD %XMM14,%XMM6 |
(246) 0x4336e2 MOVDQA %XMM14,%XMM10 |
(246) 0x4336e7 PUNPCKLDQ %XMM6,%XMM10 |
(246) 0x4336ec MOVAPD %XMM7,%XMM6 |
(246) 0x4336f0 ANDNPD %XMM10,%XMM6 |
(246) 0x4336f5 PXOR %XMM10,%XMM10 |
(246) 0x4336fa PCMPGTD %XMM9,%XMM10 |
(246) 0x4336ff PUNPCKLDQ %XMM10,%XMM9 |
(246) 0x433704 MOVSD (%RAX),%XMM13 |
(246) 0x433709 MOVHPD (%R8),%XMM13 |
(246) 0x43370e PAND %XMM7,%XMM9 |
(246) 0x433713 POR %XMM6,%XMM9 |
(246) 0x433718 PSUBQ %XMM4,%XMM9 |
(246) 0x43371d PSLLQ $0x3,%XMM9 |
(246) 0x433723 PADDQ %XMM2,%XMM9 |
(246) 0x433728 MOVQ %XMM9,%RAX |
(246) 0x43372d PSHUFD $-0x12,%XMM9,%XMM6 |
(246) 0x433733 MOVQ %XMM6,%R8 |
(246) 0x433738 MOVSD (%RAX),%XMM6 |
(246) 0x43373c MOVHPD (%R8),%XMM6 |
(246) 0x433741 MOVAPD %XMM13,%XMM9 |
(246) 0x433746 SUBPD %XMM6,%XMM9 |
(246) 0x43374b ANDPD %XMM7,%XMM5 |
(246) 0x43374f ORPD %XMM3,%XMM5 |
(246) 0x433753 PSUBQ %XMM4,%XMM5 |
(246) 0x433757 PSLLQ $0x3,%XMM5 |
(246) 0x43375c PADDQ %XMM2,%XMM5 |
(246) 0x433760 MOVQ %XMM5,%RAX |
(246) 0x433765 PSHUFD $-0x12,%XMM5,%XMM3 |
(246) 0x43376a MOVQ %XMM3,%R8 |
(246) 0x43376f MOVSD (%RAX),%XMM12 |
(246) 0x433774 MOVHPD (%R8),%XMM12 |
(246) 0x433779 SUBPD %XMM13,%XMM12 |
(246) 0x43377e MOVAPD %XMM12,%XMM3 |
(246) 0x433783 MULPD %XMM9,%XMM3 |
(246) 0x433788 LEA (%R11,%R9,8),%R8 |
(246) 0x43378c PXOR %XMM10,%XMM10 |
(246) 0x433791 CMPPD $0x1,%XMM3,%XMM10 |
(246) 0x433797 MOVMSKPD %XMM10,%EAX |
(246) 0x43379c TEST $0x1,%AL |
(246) 0x43379e JE 4337a5 |
(246) 0x4337a0 MOVSD (%R8),%XMM5 |
(246) 0x4337a5 TEST $0x2,%AL |
(246) 0x4337a7 JE 4337af |
(246) 0x4337a9 MOVHPD 0x8(%R8),%XMM5 |
(246) 0x4337af SHUFPS $-0x18,%XMM7,%XMM7 |
(246) 0x4337b3 PAND %XMM7,%XMM11 |
(246) 0x4337b8 PANDN %XMM14,%XMM7 |
(246) 0x4337bd POR %XMM11,%XMM7 |
(246) 0x4337c2 MOVAPD %XMM8,%XMM11 |
(246) 0x4337c7 ANDPD %XMM1,%XMM11 |
(246) 0x4337cc PADDQ %XMM15,%XMM0 |
(246) 0x4337d1 MOVQ %XMM0,%R8 |
(246) 0x4337d6 PSHUFD $-0x12,%XMM0,%XMM0 |
(246) 0x4337db MOVQ %XMM0,%R10 |
(246) 0x4337e0 MOVSD (%R8),%XMM0 |
(246) 0x4337e5 MOVHPD (%R10),%XMM0 |
(246) 0x4337ea DIVPD %XMM0,%XMM11 |
(246) 0x4337ef MOVAPD %XMM11,%XMM0 |
(246) 0x4337f4 ADDPD 0xf9de4(%RIP),%XMM0 |
(246) 0x4337fc MOVQ 0x20(%RBP),%XMM3 |
(246) 0x433801 PSHUFD $0x44,%XMM3,%XMM3 |
(246) 0x433806 XORPD %XMM6,%XMM6 |
(246) 0x43380a PCMPGTD %XMM7,%XMM6 |
(246) 0x43380e PUNPCKLDQ %XMM6,%XMM7 |
(246) 0x433812 PSUBQ %XMM4,%XMM7 |
(246) 0x433816 PSLLQ $0x3,%XMM7 |
(246) 0x43381b PADDQ %XMM3,%XMM7 |
(246) 0x43381f TEST $0x1,%AL |
(246) 0x433821 JE 43382d |
(246) 0x433823 MOVQ %XMM7,%R8 |
(246) 0x433828 MOVSD (%R8),%XMM3 |
(246) 0x43382d ANDPD %XMM1,%XMM9 |
(246) 0x433832 MOVAPD %XMM12,%XMM14 |
(246) 0x433837 ANDPD %XMM1,%XMM14 |
(246) 0x43383c MOVAPD 0xf9dac(%RIP),%XMM6 |
(246) 0x433844 SUBPD %XMM11,%XMM6 |
(246) 0x433849 MULPD %XMM14,%XMM6 |
(246) 0x43384e DIVPD %XMM5,%XMM6 |
(246) 0x433852 MULPD %XMM9,%XMM0 |
(246) 0x433857 TEST $0x2,%AL |
(246) 0x433859 JE 433590 |
(246) 0x43385f PSHUFD $-0x12,%XMM7,%XMM7 |
(246) 0x433864 MOVQ %XMM7,%RAX |
(246) 0x433869 MOVHPD (%RAX),%XMM3 |
(246) 0x43386d JMP 433590 |
0x433880 CMP %RDX,-0xc0(%RBP) |
0x433887 XORPD %XMM10,%XMM10 |
0x43388c MOVSD 0xf9bcb(%RIP),%XMM11 |
0x433895 MOVSD 0x1315aa(%RIP),%XMM12 |
0x43389e MOVSD 0xf9bc1(%RIP),%XMM13 |
0x4338a7 JE 4334a0 |
0x4338ad JMP 4338b1 |
0x4338af XOR %EDX,%EDX |
0x4338b1 MOV -0xc0(%RBP),%RDI |
0x4338b8 SUB %RDX,%RDI |
0x4338bb MOV -0xf0(%RBP),%RAX |
0x4338c2 ADD %EDX,%EAX |
0x4338c4 MOV %RAX,-0x38(%RBP) |
0x4338c8 MOV -0x30(%RBP),%RAX |
0x4338cc LEA (%RAX,%RDX,1),%R15D |
0x4338d0 LEA (%RCX,%RDX,1),%RAX |
0x4338d4 MOV -0xa8(%RBP),%R8 |
0x4338db LEA (%R8,%RAX,8),%RBX |
0x4338df MOV -0xe8(%RBP),%R8 |
0x4338e6 LEA (%R8,%RAX,8),%R8 |
0x4338ea MOV %R8,-0x68(%RBP) |
0x4338ee IMUL -0x90(%RBP),%R14 |
0x4338f6 LEA (%R14,%RAX,8),%R11 |
0x4338fa ADD -0xf8(%RBP),%R11 |
0x433901 MOV -0x98(%RBP),%RAX |
0x433908 MOV -0x40(%RBP),%R8 |
0x43390c LEA (%RAX,%R8,1),%R13 |
0x433910 SUB -0xa0(%RBP),%R13 |
0x433917 XOR %EAX,%EAX |
0x433919 JMP 43394a |
(245) 0x433920 MOVAPD %XMM12,%XMM0 |
(245) 0x433925 SUBSD %XMM4,%XMM0 |
(245) 0x433929 MULSD %XMM8,%XMM0 |
(245) 0x43392e ADDSD %XMM5,%XMM0 |
(245) 0x433932 MULSD %XMM2,%XMM0 |
(245) 0x433936 MOVSD %XMM0,(%RBX,%RAX,8) |
(245) 0x43393b INC %RDX |
(245) 0x43393e INC %RAX |
(245) 0x433941 CMP %RAX,%RDI |
(245) 0x433944 JE 4334a0 |
(245) 0x43394a LEA (%RDX,%RCX,1),%R9D |
(245) 0x43394e DEC %R9D |
(245) 0x433951 MOVSD (%R11,%RAX,8),%XMM2 |
(245) 0x433957 LEA (%R15,%RAX,1),%R8 |
(245) 0x43395b UCOMISD %XMM2,%XMM10 |
(245) 0x433960 JBE 433970 |
(245) 0x433962 MOV -0x30(%RBP),%R10 |
(245) 0x433966 LEA (%R10,%RDX,1),%R14D |
(245) 0x43396a LEA 0x1(%R14),%R10D |
(245) 0x43396e JMP 433983 |
(245) 0x433970 MOV -0x38(%RBP),%R10 |
(245) 0x433974 ADD %EAX,%R10D |
(245) 0x433977 MOV %R8D,%R14D |
(245) 0x43397a MOV %R9D,%R8D |
(245) 0x43397d MOV %R14D,%R9D |
(245) 0x433980 MOV %R10D,%R14D |
(245) 0x433983 MOVAPD %XMM2,%XMM4 |
(245) 0x433987 ANDPD %XMM1,%XMM4 |
(245) 0x43398b MOV -0x50(%RBP),%R12 |
(245) 0x43398f IMUL %R13,%R12 |
(245) 0x433993 ADD 0x40(%RBP),%R12 |
(245) 0x433997 MOVSXD %R8D,%R8 |
(245) 0x43399a SUB %RSI,%R8 |
(245) 0x43399d DIVSD (%R12,%R8,8),%XMM4 |
(245) 0x4339a3 MOV -0x48(%RBP),%R12 |
(245) 0x4339a7 IMUL %R13,%R12 |
(245) 0x4339ab ADD 0x88(%RBP),%R12 |
(245) 0x4339b2 MOVSXD %R10D,%R10 |
(245) 0x4339b5 SUB %RSI,%R10 |
(245) 0x4339b8 MOVSXD %R9D,%R9 |
(245) 0x4339bb SUB %RSI,%R9 |
(245) 0x4339be MOVSD (%R12,%R8,8),%XMM5 |
(245) 0x4339c4 MOVAPD %XMM5,%XMM0 |
(245) 0x4339c8 MOVHPD (%R12,%R9,8),%XMM0 |
(245) 0x4339ce MOVSD (%R12,%R10,8),%XMM3 |
(245) 0x4339d4 UNPCKLPD %XMM5,%XMM3 |
(245) 0x4339d8 SUBPD %XMM3,%XMM0 |
(245) 0x4339dc MOVAPD %XMM0,%XMM7 |
(245) 0x4339e0 UNPCKHPD %XMM0,%XMM7 |
(245) 0x4339e4 MOVAPD %XMM7,%XMM3 |
(245) 0x4339e8 MULSD %XMM0,%XMM3 |
(245) 0x4339ec XORPD %XMM8,%XMM8 |
(245) 0x4339f1 UCOMISD %XMM8,%XMM3 |
(245) 0x4339f6 JBE 433920 |
(245) 0x4339fc ANDPD %XMM1,%XMM0 |
(245) 0x433a00 MOVAPD %XMM11,%XMM3 |
(245) 0x433a05 SUBSD %XMM4,%XMM3 |
(245) 0x433a09 MOVAPD %XMM4,%XMM6 |
(245) 0x433a0d ADDSD %XMM12,%XMM6 |
(245) 0x433a12 UNPCKLPD %XMM3,%XMM6 |
(245) 0x433a16 MOVSXD %R14D,%R8 |
(245) 0x433a19 SUB %RSI,%R8 |
(245) 0x433a1c MOV -0x68(%RBP),%R9 |
(245) 0x433a20 MOVSD (%R9,%RAX,8),%XMM3 |
(245) 0x433a26 MULPD %XMM0,%XMM6 |
(245) 0x433a2a MOV 0x20(%RBP),%R9 |
(245) 0x433a2e MOVSD (%R9,%R8,8),%XMM8 |
(245) 0x433a34 UNPCKLPD %XMM3,%XMM8 |
(245) 0x433a39 DIVPD %XMM8,%XMM6 |
(245) 0x433a3e MOVAPD %XMM6,%XMM8 |
(245) 0x433a43 UNPCKHPD %XMM6,%XMM8 |
(245) 0x433a48 ADDSD %XMM6,%XMM8 |
(245) 0x433a4d MULSD %XMM13,%XMM3 |
(245) 0x433a52 MULSD %XMM8,%XMM3 |
(245) 0x433a57 PSHUFD $-0x12,%XMM0,%XMM6 |
(245) 0x433a5c MOVAPD %XMM0,%XMM9 |
(245) 0x433a61 CMPSD $0x2,%XMM6,%XMM9 |
(245) 0x433a67 ANDPD %XMM9,%XMM0 |
(245) 0x433a6c ANDNPD %XMM6,%XMM9 |
(245) 0x433a71 ORPD %XMM0,%XMM9 |
(245) 0x433a76 MOVAPD %XMM3,%XMM8 |
(245) 0x433a7b CMPSD $0x2,%XMM9,%XMM8 |
(245) 0x433a81 ANDPD %XMM8,%XMM3 |
(245) 0x433a86 ANDNPD %XMM9,%XMM8 |
(245) 0x433a8b ORPD %XMM3,%XMM8 |
(245) 0x433a90 CMPSD $0x2,%XMM10,%XMM7 |
(245) 0x433a96 MOVAPD %XMM7,%XMM0 |
(245) 0x433a9a ANDNPD %XMM8,%XMM0 |
(245) 0x433a9f XORPD 0x12f388(%RIP),%XMM8 |
(245) 0x433aa8 ANDPD %XMM7,%XMM8 |
(245) 0x433aad ORPD %XMM0,%XMM8 |
(245) 0x433ab2 JMP 433920 |
/beegfs/hackathon/users/eoseret/qaas_runs/170-861-0321/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 81 - 177 |
-------------------------------------------------------------------------------- |
81: IF(mom_sweep.EQ.1)THEN ! x 1 |
[...] |
150: DO k=y_min,y_max+1 |
151: DO j=x_min-1,x_max+1 |
152: IF(node_flux(j,k).LT.0.0)THEN |
[...] |
158: upwind=j-1 |
159: donor=j |
160: downwind=j+1 |
161: dif=upwind |
162: ENDIF |
163: sigma=ABS(node_flux(j,k))/(node_mass_pre(donor,k)) |
164: width=celldx(j) |
165: vdiffuw=vel1(donor,k)-vel1(upwind,k) |
166: vdiffdw=vel1(downwind,k)-vel1(donor,k) |
167: limiter=0.0 |
168: IF(vdiffuw*vdiffdw.GT.0.0)THEN |
169: auw=ABS(vdiffuw) |
170: adw=ABS(vdiffdw) |
171: wind=1.0_8 |
172: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
173: limiter=wind*MIN(width*((2.0_8-sigma)*adw/width+(1.0_8+sigma)*auw/celldx(dif))/6.0_8,auw,adw) |
174: ENDIF |
175: advec_vel_s=vel1(donor,k)+(1.0-sigma)*limiter |
176: mom_flux(j,k)=advec_vel_s*node_flux(j,k) |
177: ENDDO |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.74 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.47 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.04 |
Bottlenecks | P5, P6, P7, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:150-152,advec_mom_kernel.f90:163-165 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 14.33 |
CQA cycles if no scalar integer | 3.83 |
CQA cycles if FP arith vectorized | 14.33 |
CQA cycles if fully vectorized | 3.21 |
Front-end cycles | 13.83 |
DIV/SQRT cycles | 7.50 |
P0 cycles | 7.50 |
P1 cycles | 7.25 |
P2 cycles | 7.25 |
P3 cycles | 3.50 |
P4 cycles | 14.33 |
P5 cycles | 14.33 |
P6 cycles | 14.33 |
P7 cycles | 2.00 |
P8 cycles | 2.00 |
P9 cycles | 2.00 |
P10 cycles | 2.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 78.00 |
Nb uops | 83.00 |
Nb loads | 38.00 |
Nb stores | 7.00 |
Nb stack references | 26.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 24.56 |
Bytes prefetched | 0.00 |
Bytes loaded | 296.00 |
Bytes stored | 56.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 16.67 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 33.33 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 33.33 |
Vector-efficiency ratio all | 14.24 |
Vector-efficiency ratio load | 12.15 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 16.67 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 15.63 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.74 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.47 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.04 |
Bottlenecks | P5, P6, P7, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:150-152,advec_mom_kernel.f90:163-165 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 14.33 |
CQA cycles if no scalar integer | 3.83 |
CQA cycles if FP arith vectorized | 14.33 |
CQA cycles if fully vectorized | 3.21 |
Front-end cycles | 13.83 |
DIV/SQRT cycles | 7.50 |
P0 cycles | 7.50 |
P1 cycles | 7.25 |
P2 cycles | 7.25 |
P3 cycles | 3.50 |
P4 cycles | 14.33 |
P5 cycles | 14.33 |
P6 cycles | 14.33 |
P7 cycles | 2.00 |
P8 cycles | 2.00 |
P9 cycles | 2.00 |
P10 cycles | 2.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 78.00 |
Nb uops | 83.00 |
Nb loads | 38.00 |
Nb stores | 7.00 |
Nb stack references | 26.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 24.56 |
Bytes prefetched | 0.00 |
Bytes loaded | 296.00 |
Bytes stored | 56.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 16.67 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 33.33 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 33.33 |
Vector-efficiency ratio all | 14.24 |
Vector-efficiency ratio load | 12.15 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 16.67 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 15.63 |
Path / |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-177 |
Module | exec |
nb instructions | 78 |
nb uops | 83 |
loop length | 394 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 8 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 26 |
micro-operation queue | 13.83 cycles |
front end | 13.83 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 7.50 | 7.50 | 7.25 | 7.25 | 3.50 | 14.33 | 14.33 | 14.33 | 2.00 | 2.00 | 2.00 | 2.00 | 0.00 | 0.00 |
cycles | 7.50 | 7.50 | 7.25 | 7.25 | 3.50 | 14.33 | 14.33 | 14.33 | 2.00 | 2.00 | 2.00 | 2.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 13.83 |
Dispatch | 14.33 |
Overall L1 | 14.33 |
all | 15% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 27% |
all | 25% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 16% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 33% |
all | 14% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 16% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 14% |
all | 15% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 14% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 16% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 15% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA 0x1(%RCX),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
INCQ -0x90(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV -0x60(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x88(%RBP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %RDX,%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %RDX,-0xa8(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
CMP -0x70(%RBP),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JE 432b3e <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x89e> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV %R10,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x80(%RBP),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
CMP -0x78(%RBP),%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV -0x58(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
JL 4334a0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1200> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV 0x108(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RAX),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x130(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RAX),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0xc0(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
AND $-0x2,%RDX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R9,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JE 4338af <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x160f> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
LEA -0x1(%RDX),%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0x98(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %R8,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB -0xa0(%RBP),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV %R9,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %RAX,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOVQ %R8,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 6 | 1 |
MOV %R14,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL -0x90(%RBP),%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD -0xe0(%RBP),%R12 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOVQ 0x40(%RBP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
PADDQ %XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
PSHUFD $0x44,%XMM0,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
MOVQ %RSI,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 6 | 1 |
PSHUFD $0x44,%XMM0,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
MOVQ 0x88(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
IMUL -0x48(%RBP),%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOVQ %RAX,%XMM2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 6 | 1 |
PADDQ %XMM0,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
PSHUFD $0x44,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0xd8(%RBP),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x88(%RBP),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
JMP 43362c <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x138c> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CMP %RDX,-0xc0(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
XORPD %XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOVSD 0xf9bcb(%RIP),%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOVSD 0x1315aa(%RIP),%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOVSD 0xf9bc1(%RIP),%XMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
JE 4334a0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1200> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
JMP 4338b1 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1611> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0xc0(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
SUB %RDX,%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0xf0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %EDX,%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %RAX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x30(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RAX,%RDX,1),%R15D | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LEA (%RCX,%RDX,1),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0xa8(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%R8,%RAX,8),%RBX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0xe8(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%R8,%RAX,8),%R8 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R8,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
IMUL -0x90(%RBP),%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%R14,%RAX,8),%R11 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD -0xf8(%RBP),%R11 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV -0x98(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RAX,%R8,1),%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB -0xa0(%RBP),%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 43394a <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x16aa> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-177 |
Module | exec |
nb instructions | 78 |
nb uops | 83 |
loop length | 394 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 8 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 26 |
micro-operation queue | 13.83 cycles |
front end | 13.83 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 7.50 | 7.50 | 7.25 | 7.25 | 3.50 | 14.33 | 14.33 | 14.33 | 2.00 | 2.00 | 2.00 | 2.00 | 0.00 | 0.00 |
cycles | 7.50 | 7.50 | 7.25 | 7.25 | 3.50 | 14.33 | 14.33 | 14.33 | 2.00 | 2.00 | 2.00 | 2.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 13.83 |
Dispatch | 14.33 |
Overall L1 | 14.33 |
all | 15% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 27% |
all | 25% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 16% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 33% |
all | 14% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 16% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 14% |
all | 15% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 14% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 16% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 15% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA 0x1(%RCX),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
INCQ -0x90(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV -0x60(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x88(%RBP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %RDX,%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %RDX,-0xa8(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
CMP -0x70(%RBP),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JE 432b3e <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x89e> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV %R10,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x80(%RBP),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
CMP -0x78(%RBP),%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV -0x58(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
JL 4334a0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1200> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV 0x108(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RAX),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x130(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RAX),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0xc0(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
AND $-0x2,%RDX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R9,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JE 4338af <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x160f> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
LEA -0x1(%RDX),%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0x98(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %R8,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB -0xa0(%RBP),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV %R9,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %RAX,%R8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOVQ %R8,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 6 | 1 |
MOV %R14,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL -0x90(%RBP),%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD -0xe0(%RBP),%R12 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOVQ 0x40(%RBP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
PADDQ %XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
PSHUFD $0x44,%XMM0,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
MOVQ %RSI,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 6 | 1 |
PSHUFD $0x44,%XMM0,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
MOVQ 0x88(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
IMUL -0x48(%RBP),%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOVQ %RAX,%XMM2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 6 | 1 |
PADDQ %XMM0,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 |
PSHUFD $0x44,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0xd8(%RBP),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x88(%RBP),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
JMP 43362c <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x138c> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CMP %RDX,-0xc0(%RBP) | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
XORPD %XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOVSD 0xf9bcb(%RIP),%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOVSD 0x1315aa(%RIP),%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOVSD 0xf9bc1(%RIP),%XMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
JE 4334a0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1200> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
JMP 4338b1 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1611> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0xc0(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
SUB %RDX,%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0xf0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %EDX,%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %RAX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x30(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RAX,%RDX,1),%R15D | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LEA (%RCX,%RDX,1),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0xa8(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%R8,%RAX,8),%RBX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV -0xe8(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%R8,%RAX,8),%R8 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R8,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
IMUL -0x90(%RBP),%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%R14,%RAX,8),%R11 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD -0xf8(%RBP),%R11 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV -0x98(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RAX,%R8,1),%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB -0xa0(%RBP),%R13 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 43394a <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x16aa> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |