| Loop Id: 32 | Module: attention-avx512 | Source: attention.cpp:26-262 [...] | Coverage: 0.21% |
|---|
| Loop Id: 32 | Module: attention-avx512 | Source: attention.cpp:26-262 [...] | Coverage: 0.21% |
|---|
0x406480 MOV -0x110(%RBP),%RAX |
0x406487 CMP -0x158(%RBP),%RAX |
0x40648e LEA 0x1(%RAX),%RAX |
0x406492 JE 406bc0 |
0x406498 MOV %RAX,-0x110(%RBP) |
0x40649f MOV -0xb8(%RBP),%RAX |
0x4064a6 ADD %ECX,%EAX |
0x4064a8 MOV %RAX,-0x98(%RBP) |
0x4064af MOVL $0,-0x40(%RBP) |
0x4064b6 MOVQ $0,-0x78(%RBP) |
0x4064be JMP 4064da |
0x4064c0 MOV -0x48(%RBP),%RAX |
0x4064c4 ADD %EAX,-0x40(%RBP) |
0x4064c7 MOV -0x78(%RBP),%RAX |
0x4064cb CMP -0xc8(%RBP),%EAX |
0x4064d1 LEA 0x1(%RAX),%EAX |
0x4064d4 MOV %RAX,-0x78(%RBP) |
0x4064d8 JE 406480 |
0x4064da CMPQ $0,-0x120(%RBP) |
0x4064e2 JE 4066d4 |
0x4064e8 MOV -0x78(%RBP),%RCX |
0x4064ec IMUL -0xb8(%RBP),%ECX |
0x4064f3 MOV $0x7,%R10D |
0x4064f9 XOR %EDX,%EDX |
0x4064fb MOV -0xc0(%RBP),%R13 |
0x406502 MOV -0xd0(%RBP),%R15 |
0x406509 MOV -0xb8(%RBP),%R8 |
(36) 0x406510 LEA (,%RDX,8),%ESI |
(36) 0x406517 VXORPD %XMM16,%XMM16,%XMM16 |
(36) 0x40651d MOV -0x40(%RBP),%R9D |
(36) 0x406521 MOV %R10D,%EAX |
(36) 0x406524 MOV -0x70(%RBP),%RDI |
(36) 0x406528 VXORPD %XMM15,%XMM15,%XMM15 |
(36) 0x40652d VXORPS %XMM14,%XMM14,%XMM14 |
(36) 0x406532 VXORPS %XMM13,%XMM13,%XMM13 |
(36) 0x406537 VXORPS %XMM12,%XMM12,%XMM12 |
(36) 0x40653c VXORPS %XMM11,%XMM11,%XMM11 |
(36) 0x406541 VXORPS %XMM10,%XMM10,%XMM10 |
(36) 0x406546 VXORPS %XMM9,%XMM9,%XMM9 |
(36) 0x40654b NOPL (%RAX,%RAX,1) |
(35) 0x406550 MOV %R9D,%R14D |
(35) 0x406553 VMOVSS (%R13,%R14,4),%XMM17 |
(35) 0x40655b VCVTSS2SD %XMM17,%XMM17,%XMM17 |
(35) 0x406561 LEA -0x7(%RAX),%R14D |
(35) 0x406565 VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x40656c VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x406572 VFMADD231SD %XMM18,%XMM17,%XMM16 |
(35) 0x406578 LEA -0x6(%RAX),%R14D |
(35) 0x40657c VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x406583 VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x406589 VFMADD231SD %XMM18,%XMM17,%XMM15 |
(35) 0x40658f LEA -0x5(%RAX),%R14D |
(35) 0x406593 VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x40659a VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x4065a0 LEA -0x4(%RAX),%R14D |
(35) 0x4065a4 VMOVSS (%R15,%R14,4),%XMM19 |
(35) 0x4065ab VCVTSS2SD %XMM19,%XMM19,%XMM19 |
(35) 0x4065b1 VFMADD231SD %XMM18,%XMM17,%XMM14 |
(35) 0x4065b7 VFMADD231SD %XMM19,%XMM17,%XMM13 |
(35) 0x4065bd LEA -0x3(%RAX),%R14D |
(35) 0x4065c1 VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x4065c8 VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x4065ce VFMADD231SD %XMM18,%XMM17,%XMM12 |
(35) 0x4065d4 LEA -0x2(%RAX),%R14D |
(35) 0x4065d8 VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x4065df VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x4065e5 VFMADD231SD %XMM18,%XMM17,%XMM11 |
(35) 0x4065eb LEA -0x1(%RAX),%R14D |
(35) 0x4065ef VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x4065f6 VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x4065fc VFMADD231SD %XMM18,%XMM17,%XMM10 |
(35) 0x406602 MOV %EAX,%R14D |
(35) 0x406605 VMOVSS (%R15,%R14,4),%XMM18 |
(35) 0x40660c VCVTSS2SD %XMM18,%XMM18,%XMM18 |
(35) 0x406612 VFMADD231SD %XMM18,%XMM17,%XMM9 |
(35) 0x406618 ADD %R8D,%EAX |
(35) 0x40661b INC %R9D |
(35) 0x40661e DEC %RDI |
(35) 0x406621 JNE 406550 |
(36) 0x406627 VCVTSD2SS %XMM16,%XMM16,%XMM16 |
(36) 0x40662d VMULSS %XMM1,%XMM16,%XMM16 |
(36) 0x406633 LEA (%RCX,%RSI,1),%EAX |
(36) 0x406636 MOV -0x100(%RBP),%RDI |
(36) 0x40663d VMOVSS %XMM16,(%RDI,%RAX,4) |
(36) 0x406644 VCVTSD2SS %XMM15,%XMM15,%XMM15 |
(36) 0x406649 VMULSS %XMM1,%XMM15,%XMM15 |
(36) 0x40664d LEA 0x1(%RCX,%RSI,1),%EAX |
(36) 0x406651 VMOVSS %XMM15,(%RDI,%RAX,4) |
(36) 0x406656 VCVTSD2SS %XMM14,%XMM14,%XMM14 |
(36) 0x40665b VMULSS %XMM1,%XMM14,%XMM14 |
(36) 0x40665f LEA 0x2(%RCX,%RSI,1),%EAX |
(36) 0x406663 VMOVSS %XMM14,(%RDI,%RAX,4) |
(36) 0x406668 VCVTSD2SS %XMM13,%XMM13,%XMM13 |
(36) 0x40666d VMULSS %XMM1,%XMM13,%XMM13 |
(36) 0x406671 LEA 0x3(%RCX,%RSI,1),%EAX |
(36) 0x406675 VMOVSS %XMM13,(%RDI,%RAX,4) |
(36) 0x40667a VCVTSD2SS %XMM12,%XMM12,%XMM12 |
(36) 0x40667f VMULSS %XMM1,%XMM12,%XMM12 |
(36) 0x406683 LEA 0x4(%RCX,%RSI,1),%EAX |
(36) 0x406687 VMOVSS %XMM12,(%RDI,%RAX,4) |
(36) 0x40668c VCVTSD2SS %XMM11,%XMM11,%XMM11 |
(36) 0x406691 VMULSS %XMM1,%XMM11,%XMM11 |
(36) 0x406695 LEA 0x5(%RCX,%RSI,1),%EAX |
(36) 0x406699 VMOVSS %XMM11,(%RDI,%RAX,4) |
(36) 0x40669e VCVTSD2SS %XMM10,%XMM10,%XMM10 |
(36) 0x4066a3 VMULSS %XMM1,%XMM10,%XMM10 |
(36) 0x4066a7 LEA 0x6(%RCX,%RSI,1),%EAX |
(36) 0x4066ab VMOVSS %XMM10,(%RDI,%RAX,4) |
(36) 0x4066b0 VCVTSD2SS %XMM9,%XMM9,%XMM9 |
(36) 0x4066b5 VMULSS %XMM1,%XMM9,%XMM9 |
(36) 0x4066b9 LEA 0x7(%RCX,%RSI,1),%EAX |
(36) 0x4066bd VMOVSS %XMM9,(%RDI,%RAX,4) |
(36) 0x4066c2 ADD $0x8,%R10D |
(36) 0x4066c6 CMP -0x60(%RBP),%RDX |
(36) 0x4066ca LEA 0x1(%RDX),%RDX |
(36) 0x4066ce JNE 406510 |
0x4066d4 MOV -0x148(%RBP),%R15 |
0x4066db CMP %R15D,-0xf0(%RBP) |
0x4066e2 MOV -0x160(%RBP),%R13 |
0x4066e9 MOV -0x30(%RBP),%R14 |
0x4066ed MOV -0x38(%RBP),%RCX |
0x4066f1 JAE 4064c0 |
/home/eoseret/llm-attention/attention.cpp: 26 - 262 |
-------------------------------------------------------------------------------- |
26: for (unsigned int i = 0; i < M; ++i) { |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
[...] |
262: for (size_t r = 0; r < rept; r++) |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 11.08 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.06 |
| Bottlenecks | P2, P3, P11, |
| Function | main |
| Source | attention.cpp:26-27,attention.cpp:31-33,attention.cpp:262-262 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 6.00 |
| CQA cycles if no scalar integer | 6.00 |
| CQA cycles if FP arith vectorized | 6.00 |
| CQA cycles if fully vectorized | 0.54 |
| Front-end cycles | 5.67 |
| P0 cycles | 2.60 |
| P1 cycles | 2.60 |
| P2 cycles | 6.00 |
| P3 cycles | 6.00 |
| P4 cycles | 3.00 |
| P5 cycles | 2.60 |
| P6 cycles | 2.60 |
| P7 cycles | 3.00 |
| P8 cycles | 3.00 |
| P9 cycles | 3.00 |
| P10 cycles | 2.60 |
| P11 cycles | 6.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 33.00 |
| Nb uops | 34.00 |
| Nb loads | 18.00 |
| Nb stores | 6.00 |
| Nb stack references | 16.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 28.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 128.00 |
| Bytes stored | 40.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 10.07 |
| Vector-efficiency ratio load | 10.80 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 8.75 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 11.08 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.06 |
| Bottlenecks | P2, P3, P11, |
| Function | main |
| Source | attention.cpp:26-27,attention.cpp:31-33,attention.cpp:262-262 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 6.00 |
| CQA cycles if no scalar integer | 6.00 |
| CQA cycles if FP arith vectorized | 6.00 |
| CQA cycles if fully vectorized | 0.54 |
| Front-end cycles | 5.67 |
| P0 cycles | 2.60 |
| P1 cycles | 2.60 |
| P2 cycles | 6.00 |
| P3 cycles | 6.00 |
| P4 cycles | 3.00 |
| P5 cycles | 2.60 |
| P6 cycles | 2.60 |
| P7 cycles | 3.00 |
| P8 cycles | 3.00 |
| P9 cycles | 3.00 |
| P10 cycles | 2.60 |
| P11 cycles | 6.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 33.00 |
| Nb uops | 34.00 |
| Nb loads | 18.00 |
| Nb stores | 6.00 |
| Nb stack references | 16.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 28.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 128.00 |
| Bytes stored | 40.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 10.07 |
| Vector-efficiency ratio load | 10.80 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 8.75 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:26-262 |
| Module | attention-avx512 |
| nb instructions | 33 |
| nb uops | 34 |
| loop length | 179 |
| used x86 registers | 9 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 16 |
| micro-operation queue | 5.67 cycles |
| front end | 5.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.60 | 2.60 | 6.00 | 6.00 | 3.00 | 2.60 | 2.60 | 3.00 | 3.00 | 3.00 | 2.60 | 6.00 |
| cycles | 2.60 | 2.60 | 6.00 | 6.00 | 3.00 | 2.60 | 2.60 | 3.00 | 3.00 | 3.00 | 2.60 | 6.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.67 |
| Dispatch | 6.00 |
| Overall L1 | 6.00 |
| Vectorization ratio | Value |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| Vector-efficiency ratio | Value |
| all | 10% |
| load | 10% |
| store | 9% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 8% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV -0x110(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| CMP -0x158(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%RAX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 406bc0 <main+0x2dc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV %RAX,-0x110(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV -0xb8(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD %ECX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOVL $0,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| JMP 4064da <main+0x26da> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 | N/A |
| MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD %EAX,-0x40(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 | scal (6.3%) |
| MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| CMP -0xc8(%RBP),%EAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (6.3%) |
| LEA 0x1(%RAX),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| JE 406480 <main+0x2680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| CMPQ $0,-0x120(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| JE 4066d4 <main+0x28d4> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV -0x78(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| IMUL -0xb8(%RBP),%ECX | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV $0x7,%R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| MOV -0xc0(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0xd0(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| MOV -0xb8(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0x148(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| CMP %R15D,-0xf0(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
| MOV -0x160(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0x30(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| JAE 4064c0 <main+0x26c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| Function | main |
| Source file and lines | attention.cpp:26-262 |
| Module | attention-avx512 |
| nb instructions | 33 |
| nb uops | 34 |
| loop length | 179 |
| used x86 registers | 9 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 16 |
| micro-operation queue | 5.67 cycles |
| front end | 5.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.60 | 2.60 | 6.00 | 6.00 | 3.00 | 2.60 | 2.60 | 3.00 | 3.00 | 3.00 | 2.60 | 6.00 |
| cycles | 2.60 | 2.60 | 6.00 | 6.00 | 3.00 | 2.60 | 2.60 | 3.00 | 3.00 | 3.00 | 2.60 | 6.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.67 |
| Dispatch | 6.00 |
| Overall L1 | 6.00 |
| Vectorization ratio | Value |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| Vector-efficiency ratio | Value |
| all | 10% |
| load | 10% |
| store | 9% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 8% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV -0x110(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| CMP -0x158(%RBP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%RAX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 406bc0 <main+0x2dc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV %RAX,-0x110(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV -0xb8(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD %ECX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOVL $0,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOVQ $0,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| JMP 4064da <main+0x26da> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 | N/A |
| MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD %EAX,-0x40(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 | scal (6.3%) |
| MOV -0x78(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| CMP -0xc8(%RBP),%EAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (6.3%) |
| LEA 0x1(%RAX),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| JE 406480 <main+0x2680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| CMPQ $0,-0x120(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| JE 4066d4 <main+0x28d4> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV -0x78(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| IMUL -0xb8(%RBP),%ECX | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV $0x7,%R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| MOV -0xc0(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0xd0(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| MOV -0xb8(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0x148(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| CMP %R15D,-0xf0(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
| MOV -0x160(%RBP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0x30(%RBP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| MOV -0x38(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| JAE 4064c0 <main+0x26c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
