| Loop Id: 29 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 0.70% |
|---|
| Loop Id: 29 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 0.70% |
|---|
0x408480 MOV 0x88(%RSP),%R15 |
0x408488 VCVTSD2SS %XMM22,%XMM22,%XMM1 |
0x40848e LEA (%RAX,%R8,1),%EDX |
0x408492 MOV 0x100(%RSP),%RBX |
0x40849a VMOVSS %XMM1,(%RBX,%RDX,4) |
0x40849f VCVTSD2SS %XMM21,%XMM21,%XMM1 |
0x4084a5 LEA 0x1(%RAX,%R8,1),%EDX |
0x4084aa VMOVSS %XMM1,(%RBX,%RDX,4) |
0x4084af VCVTSD2SS %XMM20,%XMM20,%XMM1 |
0x4084b5 LEA 0x2(%RAX,%R8,1),%EDX |
0x4084ba VMOVSS %XMM1,(%RBX,%RDX,4) |
0x4084bf VCVTSD2SS %XMM19,%XMM19,%XMM1 |
0x4084c5 LEA 0x3(%RAX,%R8,1),%EDX |
0x4084ca VMOVSS %XMM1,(%RBX,%RDX,4) |
0x4084cf VCVTSD2SS %XMM18,%XMM18,%XMM1 |
0x4084d5 LEA 0x4(%RAX,%R8,1),%EDX |
0x4084da VMOVSS %XMM1,(%RBX,%RDX,4) |
0x4084df VCVTSD2SS %XMM17,%XMM17,%XMM1 |
0x4084e5 LEA 0x5(%RAX,%R8,1),%EDX |
0x4084ea VMOVSS %XMM1,(%RBX,%RDX,4) |
0x4084ef VCVTSD2SS %XMM16,%XMM16,%XMM1 |
0x4084f5 LEA 0x6(%RAX,%R8,1),%EDX |
0x4084fa VMOVSS %XMM1,(%RBX,%RDX,4) |
0x4084ff VCVTSD2SS %XMM23,%XMM23,%XMM1 |
0x408505 LEA 0x7(%RAX,%R8,1),%EDX |
0x40850a VMOVSS %XMM1,(%RBX,%RDX,4) |
0x40850f ADD $0x8,%ESI |
0x408512 CMP 0x180(%RSP),%RCX |
0x40851a LEA 0x1(%RCX),%RCX |
0x40851e JE 4082ab |
0x408524 LEA (,%RCX,8),%R8D |
0x40852c TEST %RDI,%RDI |
0x40852f JE 408880 |
0x408535 VPBROADCASTD %R8D,%YMM22 |
0x40853b VXORPD %XMM15,%XMM15,%XMM15 |
0x408540 VXORPD %XMM16,%XMM16,%XMM16 |
0x408546 VXORPD %XMM17,%XMM17,%XMM17 |
0x40854c VXORPD %XMM18,%XMM18,%XMM18 |
0x408552 VXORPD %XMM19,%XMM19,%XMM19 |
0x408558 VXORPD %XMM20,%XMM20,%XMM20 |
0x40855e VXORPD %XMM21,%XMM21,%XMM21 |
0x408564 VXORPS %XMM23,%XMM23,%XMM23 |
0x40856a XOR %EDX,%EDX |
0x40856c MOV 0x80(%RSP),%R15 |
0x408574 NOPW %CS:(%RAX,%RAX,1) |
(30) 0x408580 VPBROADCASTD %EDX,%YMM24 |
(30) 0x408586 VPADDD %YMM2,%YMM24,%YMM24 |
(30) 0x40858c VPMULLD %YMM24,%YMM26,%YMM24 |
(30) 0x408592 VPADDD %YMM22,%YMM24,%YMM24 |
(30) 0x408598 VPSUBD %YMM3,%YMM24,%YMM25 |
(30) 0x40859e VPXORD %XMM26,%XMM26,%XMM26 |
(30) 0x4085a4 KXNORW %K0,%K0,%K1 |
(30) 0x4085a8 VGATHERDPS (%R9,%YMM25,4),%YMM26{%K1} |
(30) 0x4085af VPADDD %YMM0,%YMM24,%YMM25 |
(30) 0x4085b5 VXORPS %XMM27,%XMM27,%XMM27 |
(30) 0x4085bb KXNORW %K0,%K0,%K1 |
(30) 0x4085bf VGATHERDPS (%R9,%YMM25,4),%YMM27{%K1} |
(30) 0x4085c6 VPADDD %YMM9,%YMM24,%YMM25 |
(30) 0x4085cc VXORPS %XMM28,%XMM28,%XMM28 |
(30) 0x4085d2 KXNORW %K0,%K0,%K1 |
(30) 0x4085d6 VGATHERDPS (%R9,%YMM25,4),%YMM28{%K1} |
(30) 0x4085dd VPADDD %YMM10,%YMM24,%YMM25 |
(30) 0x4085e3 VXORPS %XMM29,%XMM29,%XMM29 |
(30) 0x4085e9 KXNORW %K0,%K0,%K1 |
(30) 0x4085ed VGATHERDPS (%R9,%YMM25,4),%YMM29{%K1} |
(30) 0x4085f4 VPADDD %YMM11,%YMM24,%YMM25 |
(30) 0x4085fa VXORPS %XMM30,%XMM30,%XMM30 |
(30) 0x408600 KXNORW %K0,%K0,%K1 |
(30) 0x408604 VGATHERDPS (%R9,%YMM25,4),%YMM30{%K1} |
(30) 0x40860b VPADDD %YMM12,%YMM24,%YMM25 |
(30) 0x408611 VXORPS %XMM31,%XMM31,%XMM31 |
(30) 0x408617 KXNORW %K0,%K0,%K1 |
(30) 0x40861b VGATHERDPS (%R9,%YMM25,4),%YMM31{%K1} |
(30) 0x408622 VPADDD %YMM13,%YMM24,%YMM25 |
(30) 0x408628 VXORPS %XMM1,%XMM1,%XMM1 |
(30) 0x40862c KXNORW %K0,%K0,%K1 |
(30) 0x408630 VGATHERDPS (%R9,%YMM25,4),%YMM1{%K1} |
(30) 0x408637 LEA (%R11,%RDX,1),%EBX |
(30) 0x40863b VCVTPS2PD (%R15,%RBX,4),%ZMM25 |
(30) 0x408642 VCVTPS2PD %YMM26,%ZMM26 |
(30) 0x408648 VCVTPS2PD %YMM27,%ZMM27 |
(30) 0x40864e VFMADD231PD %ZMM26,%ZMM25,%ZMM23 |
(30) 0x408654 VFMADD231PD %ZMM27,%ZMM25,%ZMM21 |
(30) 0x40865a VCVTPS2PD %YMM28,%ZMM26 |
(30) 0x408660 VCVTPS2PD %YMM29,%ZMM27 |
(30) 0x408666 VFMADD231PD %ZMM26,%ZMM25,%ZMM20 |
(30) 0x40866c VFMADD231PD %ZMM27,%ZMM25,%ZMM19 |
(30) 0x408672 VCVTPS2PD %YMM30,%ZMM26 |
(30) 0x408678 VCVTPS2PD %YMM31,%ZMM27 |
(30) 0x40867e VFMADD231PD %ZMM26,%ZMM25,%ZMM18 |
(30) 0x408684 VFMADD231PD %ZMM27,%ZMM25,%ZMM17 |
(30) 0x40868a VCVTPS2PD %YMM1,%ZMM1 |
(30) 0x408690 VPADDD %YMM14,%YMM24,%YMM24 |
(30) 0x408696 VXORPD %XMM26,%XMM26,%XMM26 |
(30) 0x40869c KXNORW %K0,%K0,%K1 |
(30) 0x4086a0 VGATHERDPS (%R9,%YMM24,4),%YMM26{%K1} |
(30) 0x4086a7 VFMADD231PD %ZMM1,%ZMM25,%ZMM16 |
(30) 0x4086ad VCVTPS2PD %YMM26,%ZMM1 |
(30) 0x4086b3 VMOVDQU64 0x120(%RSP),%YMM26 |
(30) 0x4086bb VFMADD231PD %ZMM1,%ZMM25,%ZMM15 |
(30) 0x4086c1 ADD $0x8,%RDX |
(30) 0x4086c5 CMP %RDI,%RDX |
(30) 0x4086c8 JB 408580 |
0x4086ce VEXTRACTF64X4 $0x1,%ZMM23,%YMM1 |
0x4086d5 VADDPD %ZMM1,%ZMM23,%ZMM1 |
0x4086db VMOVAPD %XMM1,%XMM22 |
0x4086e1 VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x4086e7 VADDPD %XMM1,%XMM22,%XMM1 |
0x4086ed VSHUFPD $0x1,%XMM1,%XMM1,%XMM22 |
0x4086f4 VADDSD %XMM22,%XMM1,%XMM22 |
0x4086fa VEXTRACTF64X4 $0x1,%ZMM21,%YMM1 |
0x408701 VADDPD %ZMM1,%ZMM21,%ZMM1 |
0x408707 VMOVAPD %XMM1,%XMM21 |
0x40870d VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x408713 VADDPD %XMM1,%XMM21,%XMM1 |
0x408719 VSHUFPD $0x1,%XMM1,%XMM1,%XMM21 |
0x408720 VADDSD %XMM21,%XMM1,%XMM21 |
0x408726 VEXTRACTF64X4 $0x1,%ZMM20,%YMM1 |
0x40872d VADDPD %ZMM1,%ZMM20,%ZMM1 |
0x408733 VMOVAPD %XMM1,%XMM20 |
0x408739 VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x40873f VADDPD %XMM1,%XMM20,%XMM1 |
0x408745 VSHUFPD $0x1,%XMM1,%XMM1,%XMM20 |
0x40874c VADDSD %XMM20,%XMM1,%XMM20 |
0x408752 VEXTRACTF64X4 $0x1,%ZMM19,%YMM1 |
0x408759 VADDPD %ZMM1,%ZMM19,%ZMM1 |
0x40875f VMOVAPD %XMM1,%XMM19 |
0x408765 VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x40876b VADDPD %XMM1,%XMM19,%XMM1 |
0x408771 VSHUFPD $0x1,%XMM1,%XMM1,%XMM19 |
0x408778 VADDSD %XMM19,%XMM1,%XMM19 |
0x40877e VEXTRACTF64X4 $0x1,%ZMM18,%YMM1 |
0x408785 VADDPD %ZMM1,%ZMM18,%ZMM1 |
0x40878b VMOVAPD %XMM1,%XMM18 |
0x408791 VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x408797 VADDPD %XMM1,%XMM18,%XMM1 |
0x40879d VSHUFPD $0x1,%XMM1,%XMM1,%XMM18 |
0x4087a4 VADDSD %XMM18,%XMM1,%XMM18 |
0x4087aa VEXTRACTF64X4 $0x1,%ZMM17,%YMM1 |
0x4087b1 VADDPD %ZMM1,%ZMM17,%ZMM1 |
0x4087b7 VMOVAPD %XMM1,%XMM17 |
0x4087bd VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x4087c3 VADDPD %XMM1,%XMM17,%XMM1 |
0x4087c9 VSHUFPD $0x1,%XMM1,%XMM1,%XMM17 |
0x4087d0 VADDSD %XMM17,%XMM1,%XMM17 |
0x4087d6 VEXTRACTF64X4 $0x1,%ZMM16,%YMM1 |
0x4087dd VADDPD %ZMM1,%ZMM16,%ZMM1 |
0x4087e3 VMOVAPD %XMM1,%XMM16 |
0x4087e9 VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x4087ef VADDPD %XMM1,%XMM16,%XMM1 |
0x4087f5 VSHUFPD $0x1,%XMM1,%XMM1,%XMM16 |
0x4087fc VADDSD %XMM16,%XMM1,%XMM16 |
0x408802 VEXTRACTF64X4 $0x1,%ZMM15,%YMM1 |
0x408809 VADDPD %ZMM1,%ZMM15,%ZMM1 |
0x40880f VMOVAPD %XMM1,%XMM15 |
0x408813 VEXTRACTF128 $0x1,%YMM1,%XMM1 |
0x408819 VADDPD %XMM1,%XMM15,%XMM1 |
0x40881d VSHUFPD $0x1,%XMM1,%XMM1,%XMM15 |
0x408822 VADDSD %XMM15,%XMM1,%XMM23 |
0x408828 CMP %EDI,0x60(%RSP) |
0x40882c JE 408480 |
0x408832 VPUNPCKLQDQ %XMM23,%XMM16,%XMM1 |
0x408838 VPUNPCKLQDQ %XMM17,%XMM18,%XMM15 |
0x40883e VINSERTF128 $0x1,%XMM1,%YMM15,%YMM1 |
0x408844 VPUNPCKLQDQ %XMM19,%XMM20,%XMM15 |
0x40884a VPUNPCKLQDQ %XMM21,%XMM22,%XMM16 |
0x408850 VINSERTF32X4 $0x1,%XMM15,%YMM16,%YMM15 |
0x408857 VINSERTF64X4 $0x1,%YMM1,%ZMM15,%ZMM15 |
0x40885e MOV %RDI,%RDX |
0x408861 JMP 408887 |
0x408880 VXORPD %XMM15,%XMM15,%XMM15 |
0x408885 XOR %EDX,%EDX |
0x408887 MOV 0x40(%RSP),%R15 |
0x40888c MOV %R15D,%EBX |
0x40888f IMUL %EDX,%EBX |
0x408892 MOV %RSI,0x30(%RSP) |
0x408897 ADD %ESI,%EBX |
0x408899 MOV 0x80(%RSP),%RSI |
0x4088a1 MOV 0x60(%RSP),%R10 |
0x4088a6 NOPW %CS:(%RAX,%RAX,1) |
(24) 0x4088b0 LEA (%R11,%RDX,1),%R14D |
(24) 0x4088b4 VMOVSS (%RSI,%R14,4),%XMM1 |
(24) 0x4088ba VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(24) 0x4088be LEA 0x5(%RBX),%R14D |
(24) 0x4088c2 VPBROADCASTD %EBX,%XMM16 |
(24) 0x4088c8 VPADDD %YMM4,%YMM16,%YMM0 |
(24) 0x4088ce VPERMT2D %YMM16,%YMM5,%YMM0 |
(24) 0x4088d4 VPBROADCASTD %R14D,%YMM8 |
(24) 0x4088da VPBLENDD $0x20,%YMM8,%YMM0,%YMM0 |
(24) 0x4088e0 VPADDD %YMM6,%YMM16,%YMM8 |
(24) 0x4088e6 VPERMT2Q %YMM8,%YMM7,%YMM0 |
(24) 0x4088ec VPSUBD %YMM3,%YMM0,%YMM0 |
(24) 0x4088f0 VPXOR %XMM8,%XMM8,%XMM8 |
(24) 0x4088f5 KXNORW %K0,%K0,%K1 |
(24) 0x4088f9 VGATHERDPS (%R9,%YMM0,4),%YMM8{%K1} |
(24) 0x408900 VCVTPS2PD %YMM8,%ZMM0 |
(24) 0x408906 VBROADCASTSD %XMM1,%ZMM1 |
(24) 0x40890c VFMADD231PD %ZMM1,%ZMM0,%ZMM15 |
(24) 0x408912 INC %RDX |
(24) 0x408915 ADD %R15D,%EBX |
(24) 0x408918 CMP %RDX,%R10 |
(24) 0x40891b JNE 4088b0 |
0x40891d VEXTRACTF32X4 $0x3,%ZMM15,%XMM16 |
0x408924 VSHUFPD $0x1,%XMM16,%XMM16,%XMM23 |
0x40892b VEXTRACTF32X4 $0x2,%ZMM15,%XMM18 |
0x408932 VSHUFPD $0x1,%XMM18,%XMM18,%XMM17 |
0x408939 VMOVAPD %YMM15,%YMM1 |
0x40893d VEXTRACTF32X4 $0x1,%YMM15,%XMM20 |
0x408944 VSHUFPD $0x1,%XMM20,%XMM20,%XMM19 |
0x40894b VSHUFPD $0x1,%XMM1,%XMM1,%XMM21 |
0x408952 VMOVAPD %XMM15,%XMM22 |
0x408958 MOV 0x98(%RSP),%R14 |
0x408960 MOV 0x88(%RSP),%R15 |
0x408968 VPBROADCASTD 0x7a73(%RIP),%YMM0 |
0x408971 MOV 0x30(%RSP),%RSI |
0x408976 JMP 408488 |
/home/eoseret/llm-attention/attention.cpp: 27 - 33 |
-------------------------------------------------------------------------------- |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.06 |
| CQA speedup if FP arith vectorized | 1.12 |
| CQA speedup if fully vectorized | 3.20 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.21 |
| Bottlenecks | |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 25.17 |
| CQA cycles if no scalar integer | 23.83 |
| CQA cycles if FP arith vectorized | 22.50 |
| CQA cycles if fully vectorized | 7.85 |
| Front-end cycles | 17.50 |
| P0 cycles | 13.17 |
| P1 cycles | 25.17 |
| P2 cycles | 2.78 |
| P3 cycles | 2.78 |
| P4 cycles | 4.33 |
| P5 cycles | 25.00 |
| P6 cycles | 4.33 |
| P7 cycles | 4.33 |
| P8 cycles | 4.33 |
| P9 cycles | 4.33 |
| P10 cycles | 4.33 |
| P11 cycles | 2.78 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 97.33 |
| Nb uops | 105.00 |
| Nb loads | 8.33 |
| Nb stores | 8.67 |
| Nb stack references | 7.00 |
| FLOP/cycle | 2.33 |
| Nb FLOP add-sub | 58.67 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 5.05 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 61.33 |
| Bytes stored | 37.33 |
| Stride 0 | 0.67 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 1.33 |
| Stride indirect | 1.33 |
| Vectorization ratio all | 52.69 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 65.33 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 67.14 |
| Vector-efficiency ratio all | 24.07 |
| Vector-efficiency ratio load | 11.63 |
| Vector-efficiency ratio store | 6.71 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 45.04 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 23.57 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.05 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 6.57 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.08 |
| Bottlenecks | P1, P5, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 11.50 |
| CQA cycles if no scalar integer | 11.00 |
| CQA cycles if FP arith vectorized | 11.50 |
| CQA cycles if fully vectorized | 1.75 |
| Front-end cycles | 10.67 |
| P0 cycles | 8.00 |
| P1 cycles | 11.50 |
| P2 cycles | 3.00 |
| P3 cycles | 3.00 |
| P4 cycles | 4.50 |
| P5 cycles | 11.50 |
| P6 cycles | 4.00 |
| P7 cycles | 4.50 |
| P8 cycles | 4.50 |
| P9 cycles | 4.50 |
| P10 cycles | 4.00 |
| P11 cycles | 3.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 56.00 |
| Nb uops | 64.00 |
| Nb loads | 9.00 |
| Nb stores | 9.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 9.39 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 68.00 |
| Bytes stored | 40.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 1.00 |
| Stride indirect | 1.00 |
| Vectorization ratio all | 29.41 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 47.62 |
| Vector-efficiency ratio all | 15.26 |
| Vector-efficiency ratio load | 11.46 |
| Vector-efficiency ratio store | 6.94 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 19.35 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.04 |
| CQA speedup if FP arith vectorized | 1.15 |
| CQA speedup if fully vectorized | 2.91 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.51 |
| Bottlenecks | P1, P5, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 35.50 |
| CQA cycles if no scalar integer | 34.00 |
| CQA cycles if FP arith vectorized | 31.00 |
| CQA cycles if fully vectorized | 12.18 |
| Front-end cycles | 23.50 |
| P0 cycles | 16.00 |
| P1 cycles | 35.50 |
| P2 cycles | 3.67 |
| P3 cycles | 3.67 |
| P4 cycles | 4.50 |
| P5 cycles | 35.50 |
| P6 cycles | 5.00 |
| P7 cycles | 4.50 |
| P8 cycles | 4.50 |
| P9 cycles | 4.50 |
| P10 cycles | 5.00 |
| P11 cycles | 3.67 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 133.00 |
| Nb uops | 141.00 |
| Nb loads | 11.00 |
| Nb stores | 9.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 2.48 |
| Nb FLOP add-sub | 88.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 3.38 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 80.00 |
| Bytes stored | 40.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 2.00 |
| Vectorization ratio all | 63.55 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 66.67 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 75.36 |
| Vector-efficiency ratio all | 27.75 |
| Vector-efficiency ratio load | 10.94 |
| Vector-efficiency ratio store | 6.94 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 45.83 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 25.27 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.08 |
| CQA speedup if FP arith vectorized | 1.14 |
| CQA speedup if fully vectorized | 2.96 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.02 |
| Bottlenecks | P1, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 28.50 |
| CQA cycles if no scalar integer | 26.50 |
| CQA cycles if FP arith vectorized | 25.00 |
| CQA cycles if fully vectorized | 9.63 |
| Front-end cycles | 18.33 |
| P0 cycles | 15.50 |
| P1 cycles | 28.50 |
| P2 cycles | 1.67 |
| P3 cycles | 1.67 |
| P4 cycles | 4.00 |
| P5 cycles | 28.00 |
| P6 cycles | 4.00 |
| P7 cycles | 4.00 |
| P8 cycles | 4.00 |
| P9 cycles | 4.00 |
| P10 cycles | 4.00 |
| P11 cycles | 1.67 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 103.00 |
| Nb uops | 110.00 |
| Nb loads | 5.00 |
| Nb stores | 8.00 |
| Nb stack references | 5.00 |
| FLOP/cycle | 3.09 |
| Nb FLOP add-sub | 88.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 2.39 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 36.00 |
| Bytes stored | 32.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 1.00 |
| Stride indirect | 1.00 |
| Vectorization ratio all | 65.12 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 64.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 78.43 |
| Vector-efficiency ratio all | 29.22 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 44.25 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 26.10 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 97.33 |
| nb uops | 105 |
| loop length | 571.67 |
| used x86 registers | 10.33 |
| used mmx registers | 0 |
| used xmm registers | 10 |
| used ymm registers | 3.33 |
| used zmm registers | 6.33 |
| nb stack references | 7 |
| micro-operation queue | 17.50 cycles |
| front end | 17.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 13.17 | 25.17 | 2.78 | 2.78 | 4.33 | 25.00 | 4.33 | 4.33 | 4.33 | 4.33 | 4.33 | 2.78 |
| cycles | 13.17 | 25.17 | 2.78 | 2.78 | 4.33 | 25.00 | 4.33 | 4.33 | 4.33 | 4.33 | 4.33 | 2.78 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 17.50 |
| Dispatch | 25.17 |
| Data deps. | 0.00 |
| Overall L1 | 25.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 60% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 66% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 75% |
| all | 52% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 65% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 67% |
| all | 11% |
| load | 11% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 6% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| all | 24% |
| load | 11% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 23% |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 56 |
| nb uops | 64 |
| loop length | 315 |
| used x86 registers | 11 |
| used mmx registers | 0 |
| used xmm registers | 10 |
| used ymm registers | 3 |
| used zmm registers | 1 |
| nb stack references | 8 |
| micro-operation queue | 10.67 cycles |
| front end | 10.67 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 8.00 | 11.50 | 3.00 | 3.00 | 4.50 | 11.50 | 4.00 | 4.50 | 4.50 | 4.50 | 4.00 | 3.00 |
| cycles | 8.00 | 11.50 | 3.00 | 3.00 | 4.50 | 11.50 | 4.00 | 4.50 | 4.50 | 4.50 | 4.00 | 3.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 10.67 |
| Dispatch | 11.50 |
| Data deps. | 0.00 |
| Overall L1 | 11.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 55% |
| all | 29% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 47% |
| all | 11% |
| load | 11% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 16% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 20% |
| all | 15% |
| load | 11% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 19% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM22,%XMM22,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA (%RAX,%R8,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV 0x100(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM21,%XMM21,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x1(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM20,%XMM20,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x2(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM19,%XMM19,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x3(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM18,%XMM18,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x4(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM17,%XMM17,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x5(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM16,%XMM16,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x6(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM23,%XMM23,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x7(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| ADD $0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| CMP 0x180(%RSP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%RCX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 4082ab <main+0x392b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| LEA (,%RCX,8),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| TEST %RDI,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 | scal (12.5%) |
| JE 408880 <main+0x3f00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VXORPD %XMM15,%XMM15,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV 0x40(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV %R15D,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| IMUL %EDX,%EBX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RSI,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| ADD %ESI,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV 0x80(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| MOV 0x60(%RSP),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| VEXTRACTF32X4 $0x3,%ZMM15,%XMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VSHUFPD $0x1,%XMM16,%XMM16,%XMM23 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VEXTRACTF32X4 $0x2,%ZMM15,%XMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VSHUFPD $0x1,%XMM18,%XMM18,%XMM17 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VMOVAPD %YMM15,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (50.0%) |
| VEXTRACTF32X4 $0x1,%YMM15,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VSHUFPD $0x1,%XMM20,%XMM20,%XMM19 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM21 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VMOVAPD %XMM15,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| MOV 0x98(%RSP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV 0x88(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| VPBROADCASTD 0x7a73(%RIP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 | scal (6.3%) |
| MOV 0x30(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| JMP 408488 <main+0x3b08> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 | N/A |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 133 |
| nb uops | 141 |
| loop length | 788 |
| used x86 registers | 11 |
| used mmx registers | 0 |
| used xmm registers | 10 |
| used ymm registers | 5 |
| used zmm registers | 9 |
| nb stack references | 8 |
| micro-operation queue | 23.50 cycles |
| front end | 23.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 16.00 | 35.50 | 3.67 | 3.67 | 4.50 | 35.50 | 5.00 | 4.50 | 4.50 | 4.50 | 5.00 | 3.67 |
| cycles | 16.00 | 35.50 | 3.67 | 3.67 | 4.50 | 35.50 | 5.00 | 4.50 | 4.50 | 4.50 | 5.00 | 3.67 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 23.50 |
| Dispatch | 35.50 |
| Data deps. | 0.00 |
| Overall L1 | 35.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 73% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 66% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 86% |
| all | 63% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 66% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 75% |
| all | 11% |
| load | 10% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 30% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 27% |
| all | 27% |
| load | 10% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM22,%XMM22,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA (%RAX,%R8,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV 0x100(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM21,%XMM21,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x1(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM20,%XMM20,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x2(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM19,%XMM19,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x3(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM18,%XMM18,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x4(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM17,%XMM17,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x5(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM16,%XMM16,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x6(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM23,%XMM23,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x7(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| ADD $0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| CMP 0x180(%RSP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%RCX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 4082ab <main+0x392b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| LEA (,%RCX,8),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| TEST %RDI,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 | scal (12.5%) |
| JE 408880 <main+0x3f00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VPBROADCASTD %R8D,%YMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VXORPD %XMM15,%XMM15,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM16,%XMM16,%XMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM17,%XMM17,%XMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM18,%XMM18,%XMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM19,%XMM19,%XMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM20,%XMM20,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM21,%XMM21,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPS %XMM23,%XMM23,%XMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV 0x80(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| VEXTRACTF64X4 $0x1,%ZMM23,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM23,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM22,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM22 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM22,%XMM1,%XMM22 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM21,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM21,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM21,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM21 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM21,%XMM1,%XMM21 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM20,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM20,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM20,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM20 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM20,%XMM1,%XMM20 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM19,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM19,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM19,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM19 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM19,%XMM1,%XMM19 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM18,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM18,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM18,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM18 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM18,%XMM1,%XMM18 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM17,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM17,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM17,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM17 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM17,%XMM1,%XMM17 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM16,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM16,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM16,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM16 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM16,%XMM1,%XMM16 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM15,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM15,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM15,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM15,%XMM1,%XMM23 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| CMP %EDI,0x60(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (6.3%) |
| JE 408480 <main+0x3b00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VPUNPCKLQDQ %XMM23,%XMM16,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPUNPCKLQDQ %XMM17,%XMM18,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VINSERTF128 $0x1,%XMM1,%YMM15,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VPUNPCKLQDQ %XMM19,%XMM20,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPUNPCKLQDQ %XMM21,%XMM22,%XMM16 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VINSERTF32X4 $0x1,%XMM15,%YMM16,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VINSERTF64X4 $0x1,%YMM1,%ZMM15,%ZMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| MOV %RDI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| JMP 408887 <main+0x3f07> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 | N/A |
| MOV 0x40(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV %R15D,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| IMUL %EDX,%EBX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RSI,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| ADD %ESI,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV 0x80(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| MOV 0x60(%RSP),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| VEXTRACTF32X4 $0x3,%ZMM15,%XMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VSHUFPD $0x1,%XMM16,%XMM16,%XMM23 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VEXTRACTF32X4 $0x2,%ZMM15,%XMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VSHUFPD $0x1,%XMM18,%XMM18,%XMM17 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VMOVAPD %YMM15,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (50.0%) |
| VEXTRACTF32X4 $0x1,%YMM15,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VSHUFPD $0x1,%XMM20,%XMM20,%XMM19 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM21 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VMOVAPD %XMM15,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| MOV 0x98(%RSP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV 0x88(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| VPBROADCASTD 0x7a73(%RIP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 | scal (6.3%) |
| MOV 0x30(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| JMP 408488 <main+0x3b08> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 | N/A |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 103 |
| nb uops | 110 |
| loop length | 612 |
| used x86 registers | 9 |
| used mmx registers | 0 |
| used xmm registers | 10 |
| used ymm registers | 2 |
| used zmm registers | 9 |
| nb stack references | 5 |
| micro-operation queue | 18.33 cycles |
| front end | 18.33 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 15.50 | 28.50 | 1.67 | 1.67 | 4.00 | 28.00 | 4.00 | 4.00 | 4.00 | 4.00 | 4.00 | 1.67 |
| cycles | 15.50 | 28.50 | 1.67 | 1.67 | 4.00 | 28.00 | 4.00 | 4.00 | 4.00 | 4.00 | 4.00 | 1.67 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 18.33 |
| Dispatch | 28.50 |
| Data deps. | 0.00 |
| Overall L1 | 28.50 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 70% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 66% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 83% |
| all | 65% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 64% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 78% |
| all | 10% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 6% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 30% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 27% |
| all | 29% |
| load | 12% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 44% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x88(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| VCVTSD2SS %XMM22,%XMM22,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA (%RAX,%R8,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV 0x100(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM21,%XMM21,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x1(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM20,%XMM20,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x2(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM19,%XMM19,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x3(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM18,%XMM18,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x4(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM17,%XMM17,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x5(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM16,%XMM16,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x6(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VCVTSD2SS %XMM23,%XMM23,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x7(%RAX,%R8,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM1,(%RBX,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| ADD $0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| CMP 0x180(%RSP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%RCX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 4082ab <main+0x392b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| LEA (,%RCX,8),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| TEST %RDI,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 | scal (12.5%) |
| JE 408880 <main+0x3f00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VPBROADCASTD %R8D,%YMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VXORPD %XMM15,%XMM15,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM16,%XMM16,%XMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM17,%XMM17,%XMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM18,%XMM18,%XMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM19,%XMM19,%XMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM20,%XMM20,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPD %XMM21,%XMM21,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPS %XMM23,%XMM23,%XMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV 0x80(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| VEXTRACTF64X4 $0x1,%ZMM23,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM23,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM22,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM22 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM22,%XMM1,%XMM22 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM21,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM21,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM21,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM21 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM21,%XMM1,%XMM21 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM20,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM20,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM20,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM20 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM20,%XMM1,%XMM20 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM19,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM19,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM19,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM19 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM19,%XMM1,%XMM19 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM18,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM18,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM18,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM18 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM18,%XMM1,%XMM18 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM17,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM17,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM17,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM17 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM17,%XMM1,%XMM17 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM16,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM16,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM16,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM16 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM16,%XMM1,%XMM16 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VEXTRACTF64X4 $0x1,%ZMM15,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM15,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (100.0%) |
| VMOVAPD %XMM1,%XMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (25.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM1,%XMM15,%XMM1 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VADDSD %XMM15,%XMM1,%XMM23 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| CMP %EDI,0x60(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
| JE 408480 <main+0x3b00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
