| Loop Id: 51 | Module: attention-clang-gnr256 | Source: attention_v2.cpp:26-292 [...] | Coverage: 0.04% |
|---|
| Loop Id: 51 | Module: attention-clang-gnr256 | Source: attention_v2.cpp:26-292 [...] | Coverage: 0.04% |
|---|
0x62e4 MOV 0x160(%RSP),%ECX |
0x62eb INC %ECX |
0x62ed MOV 0x50(%RSP),%RDX |
0x62f2 ADD %RDX,%RAX |
0x62f5 CMP %EDX,%ECX |
0x62f7 JE 661f |
0x62fd MOV %ECX,%ESI |
0x62ff IMUL %EDX,%ESI |
0x6302 MOV %ECX,0x160(%RSP) |
0x6309 IMUL %R8D,%ECX |
0x630d MOV %RCX,0x60(%RSP) |
0x6312 ADD 0x2e0(%RSP),%ESI |
0x6319 SETB %CL |
0x631c OR 0x46(%RSP),%CL |
0x6320 MOV %CL,0x20(%RSP) |
0x6324 XOR %EDI,%EDI |
0x6326 JMP 6352 |
(52) 0x6328 VCVTSD2SS %XMM0,%XMM0,%XMM0 |
(52) 0x632c MOV 0x60(%RSP),%RCX |
(52) 0x6331 LEA (%RCX,%RDI,1),%R8D |
(52) 0x6335 MOV 0x198(%RSP),%R9 |
(52) 0x633d VMOVSS %XMM0,(%R9,%R8,4) |
(52) 0x6343 INC %RDI |
(52) 0x6346 CMP 0x28(%RSP),%RDI |
(52) 0x634b MOV 0x38(%RSP),%R8 |
(52) 0x6350 JE 62e4 |
(52) 0x6352 VXORPS %XMM0,%XMM0,%XMM0 |
(52) 0x6356 CMP $0x4,%R13D |
(52) 0x635a JB 6376 |
(52) 0x635c MOV %EDI,%R8D |
(52) 0x635f ADD 0x2e0(%RSP),%R8D |
(52) 0x6367 SETB %R8B |
(52) 0x636b OR 0x20(%RSP),%R8B |
(52) 0x6370 JE 64e9 |
(52) 0x6376 XOR %R9D,%R9D |
(52) 0x6379 MOV 0x380(%RSP),%R11 |
(52) 0x6381 TEST %R11,%R11 |
(52) 0x6384 JE 63db |
(52) 0x6386 MOV 0x38(%RSP),%R13 |
(52) 0x638b MOV %R13,%R10 |
(52) 0x638e IMUL %R9,%R10 |
(52) 0x6392 ADD %RDI,%R10 |
(52) 0x6395 MOV %R9,%R8 |
(52) 0x6398 MOV 0x80(%RSP),%RSI |
(52) 0x63a0 MOV 0x48(%RSP),%RDX |
(52) 0x63a5 NOPW %CS:(%RAX,%RAX,1) |
(54) 0x63b0 LEA (%RAX,%R8,1),%EBX |
(54) 0x63b4 VMOVSS (%RDX,%RBX,4),%XMM1 |
(54) 0x63b9 VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(54) 0x63bd MOV %R10D,%EBX |
(54) 0x63c0 VMOVSS (%RSI,%RBX,4),%XMM2 |
(54) 0x63c5 VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(54) 0x63c9 VFMADD231SD %XMM2,%XMM1,%XMM0 |
(54) 0x63ce INC %R8 |
(54) 0x63d1 ADD %R13,%R10 |
(54) 0x63d4 DEC %R11 |
(54) 0x63d7 JNE 63b0 |
(52) 0x63d9 JMP 63eb |
(52) 0x63db MOV %R9,%R8 |
(52) 0x63de MOV 0x80(%RSP),%RSI |
(52) 0x63e6 MOV 0x48(%RSP),%RDX |
(52) 0x63eb MOV 0x88(%RSP),%R13 |
(52) 0x63f3 SUB %R13,%R9 |
(52) 0x63f6 CMP $-0x4,%R9 |
(52) 0x63fa MOV 0x220(%RSP),%RCX |
(52) 0x6402 JA 6328 |
(52) 0x6408 LEA 0x3(%R8),%R9 |
(52) 0x640c MOV 0x38(%RSP),%R13 |
(52) 0x6411 IMUL %R13,%R9 |
(52) 0x6415 ADD %RDI,%R9 |
(52) 0x6418 LEA 0x2(%R8),%R10 |
(52) 0x641c IMUL %R13,%R10 |
(52) 0x6420 ADD %RDI,%R10 |
(52) 0x6423 MOV %R13,%R11 |
(52) 0x6426 IMUL %R8,%R11 |
(52) 0x642a ADD %RDI,%R11 |
(52) 0x642d LEA 0x1(%R8),%RBX |
(52) 0x6431 IMUL %R13,%RBX |
(52) 0x6435 ADD %RDI,%RBX |
(52) 0x6438 NOPL (%RAX,%RAX,1) |
(53) 0x6440 LEA (%RAX,%R8,1),%R13D |
(53) 0x6444 VMOVSS (%RDX,%R13,4),%XMM1 |
(53) 0x644a VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(53) 0x644e MOV %R11D,%R13D |
(53) 0x6451 VMOVSS (%RSI,%R13,4),%XMM2 |
(53) 0x6457 VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(53) 0x645b LEA 0x1(%RAX,%R8,1),%R13D |
(53) 0x6460 VMOVSS (%RDX,%R13,4),%XMM3 |
(53) 0x6466 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(53) 0x646a VFMADD213SD %XMM0,%XMM1,%XMM2 |
(53) 0x646f MOV %EBX,%R13D |
(53) 0x6472 VMOVSS (%RSI,%R13,4),%XMM0 |
(53) 0x6478 VCVTSS2SD %XMM0,%XMM0,%XMM0 |
(53) 0x647c VFMADD213SD %XMM2,%XMM3,%XMM0 |
(53) 0x6481 LEA 0x2(%RAX,%R8,1),%R13D |
(53) 0x6486 VMOVSS (%RDX,%R13,4),%XMM1 |
(53) 0x648c VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(53) 0x6490 MOV %R10D,%R13D |
(53) 0x6493 VMOVSS (%RSI,%R13,4),%XMM2 |
(53) 0x6499 VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(53) 0x649d LEA 0x3(%RAX,%R8,1),%R13D |
(53) 0x64a2 VMOVSS (%RDX,%R13,4),%XMM3 |
(53) 0x64a8 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(53) 0x64ac VFMADD213SD %XMM0,%XMM1,%XMM2 |
(53) 0x64b1 MOV %R9D,%R13D |
(53) 0x64b4 VMOVSS (%RSI,%R13,4),%XMM0 |
(53) 0x64ba MOV 0x88(%RSP),%R13 |
(53) 0x64c2 VCVTSS2SD %XMM0,%XMM0,%XMM0 |
(53) 0x64c6 VFMADD213SD %XMM2,%XMM3,%XMM0 |
(53) 0x64cb ADD $0x4,%R8 |
(53) 0x64cf ADD %RCX,%R9 |
(53) 0x64d2 ADD %RCX,%R10 |
(53) 0x64d5 ADD %RCX,%R11 |
(53) 0x64d8 ADD %RCX,%RBX |
(53) 0x64db CMP %R8,%R13 |
(53) 0x64de JNE 6440 |
(52) 0x64e4 JMP 6328 |
(52) 0x64e9 CMP $0x10,%R13D |
(52) 0x64ed JAE 6504 |
(52) 0x64ef XOR %R8D,%R8D |
(52) 0x64f2 MOV 0x80(%RSP),%RCX |
(52) 0x64fa MOV 0x48(%RSP),%RDX |
(52) 0x64ff JMP 65cd |
(52) 0x6504 VXORPS %XMM0,%XMM0,%XMM0 |
(52) 0x6508 XOR %R8D,%R8D |
(52) 0x650b VXORPS %XMM1,%XMM1,%XMM1 |
(52) 0x650f VXORPS %XMM2,%XMM2,%XMM2 |
(52) 0x6513 VXORPS %XMM3,%XMM3,%XMM3 |
(52) 0x6517 MOV 0x360(%RSP),%R10 |
(52) 0x651f MOV 0x80(%RSP),%RCX |
(52) 0x6527 MOV 0x48(%RSP),%RDX |
(55) 0x652c LEA (%RAX,%R8,1),%R9D |
(55) 0x6530 VCVTPS2PD (%RDX,%R9,4),%YMM4 |
(55) 0x6536 VCVTPS2PD 0x10(%RDX,%R9,4),%YMM5 |
(55) 0x653d VCVTPS2PD 0x20(%RDX,%R9,4),%YMM6 |
(55) 0x6544 VCVTPS2PD 0x30(%RDX,%R9,4),%YMM7 |
(55) 0x654b LEA (%RDI,%R8,1),%R9D |
(55) 0x654f VCVTPS2PD (%RCX,%R9,4),%YMM8 |
(55) 0x6555 VFMADD231PD %YMM8,%YMM4,%YMM0 |
(55) 0x655a VCVTPS2PD 0x10(%RCX,%R9,4),%YMM4 |
(55) 0x6561 VFMADD231PD %YMM4,%YMM5,%YMM1 |
(55) 0x6566 VCVTPS2PD 0x20(%RCX,%R9,4),%YMM4 |
(55) 0x656d VFMADD231PD %YMM4,%YMM6,%YMM2 |
(55) 0x6572 VCVTPS2PD 0x30(%RCX,%R9,4),%YMM4 |
(55) 0x6579 VFMADD231PD %YMM4,%YMM7,%YMM3 |
(55) 0x657e ADD $0x10,%R8 |
(55) 0x6582 CMP %R8,%R10 |
(55) 0x6585 JNE 652c |
(52) 0x6587 VADDPD %YMM0,%YMM1,%YMM0 |
(52) 0x658b VADDPD %YMM2,%YMM3,%YMM1 |
(52) 0x658f VADDPD %YMM0,%YMM1,%YMM0 |
(52) 0x6593 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
(52) 0x6599 VADDPD %XMM1,%XMM0,%XMM0 |
(52) 0x659d VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
(52) 0x65a2 VADDSD %XMM1,%XMM0,%XMM0 |
(52) 0x65a6 CMP %R10D,%R13D |
(52) 0x65a9 JE 6328 |
(52) 0x65af MOV %R10,%R8 |
(52) 0x65b2 MOV %R10,%R9 |
(52) 0x65b5 TESTB $0xc,0x50(%RSP) |
(52) 0x65ba MOV 0x80(%RSP),%RCX |
(52) 0x65c2 MOV 0x48(%RSP),%RDX |
(52) 0x65c7 JE 6379 |
(52) 0x65cd VMOVQ %XMM0,%XMM0 |
(52) 0x65d1 MOV 0x370(%RSP),%R10 |
(56) 0x65d9 LEA (%RAX,%R8,1),%R9D |
(56) 0x65dd VCVTPS2PD (%RDX,%R9,4),%YMM1 |
(56) 0x65e3 LEA (%RDI,%R8,1),%R9D |
(56) 0x65e7 VCVTPS2PD (%RCX,%R9,4),%YMM2 |
(56) 0x65ed VFMADD231PD %YMM2,%YMM1,%YMM0 |
(56) 0x65f2 ADD $0x4,%R8 |
(56) 0x65f6 CMP %R8,%R10 |
(56) 0x65f9 JNE 65d9 |
(52) 0x65fb VEXTRACTF128 $0x1,%YMM0,%XMM1 |
(52) 0x6601 VADDPD %XMM1,%XMM0,%XMM0 |
(52) 0x6605 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
(52) 0x660a VADDSD %XMM1,%XMM0,%XMM0 |
(52) 0x660e MOV %R10,%R9 |
(52) 0x6611 CMP %R10D,%R13D |
(52) 0x6614 JE 6328 |
(52) 0x661a JMP 6379 |
/home/eoseret/llm-attention/attention_v2.cpp: 26 - 292 |
-------------------------------------------------------------------------------- |
26: for (unsigned int i = 0; i < M; ++i) { |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
[...] |
292: start = std::chrono::steady_clock::now(); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 14.40 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.06 |
| Bottlenecks | P1, |
| Function | main |
| Source | attention_v2.cpp:26-26,attention_v2.cpp:292-292 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.00 |
| CQA cycles if no scalar integer | 3.00 |
| CQA cycles if FP arith vectorized | 3.00 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.83 |
| P0 cycles | 1.80 |
| P1 cycles | 3.00 |
| P2 cycles | 1.33 |
| P3 cycles | 1.33 |
| P4 cycles | 1.50 |
| P5 cycles | 1.80 |
| P6 cycles | 1.80 |
| P7 cycles | 1.50 |
| P8 cycles | 1.50 |
| P9 cycles | 1.50 |
| P10 cycles | 1.60 |
| P11 cycles | 1.33 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 17.00 |
| Nb uops | 17.00 |
| Nb loads | 4.00 |
| Nb stores | 3.00 |
| Nb stack references | 6.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 10.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 17.00 |
| Bytes stored | 13.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.33 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 9.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 6.25 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 14.40 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.06 |
| Bottlenecks | P1, |
| Function | main |
| Source | attention_v2.cpp:26-26,attention_v2.cpp:292-292 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.00 |
| CQA cycles if no scalar integer | 3.00 |
| CQA cycles if FP arith vectorized | 3.00 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.83 |
| P0 cycles | 1.80 |
| P1 cycles | 3.00 |
| P2 cycles | 1.33 |
| P3 cycles | 1.33 |
| P4 cycles | 1.50 |
| P5 cycles | 1.80 |
| P6 cycles | 1.80 |
| P7 cycles | 1.50 |
| P8 cycles | 1.50 |
| P9 cycles | 1.50 |
| P10 cycles | 1.60 |
| P11 cycles | 1.33 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 17.00 |
| Nb uops | 17.00 |
| Nb loads | 4.00 |
| Nb stores | 3.00 |
| Nb stack references | 6.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 10.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 17.00 |
| Bytes stored | 13.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.33 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 9.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 6.25 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:26-292 |
| Module | attention-clang-gnr256 |
| nb instructions | 17 |
| nb uops | 17 |
| loop length | 68 |
| used x86 registers | 7 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 2.83 cycles |
| front end | 2.83 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.80 | 2.00 | 1.33 | 1.33 | 1.50 | 1.80 | 1.80 | 1.50 | 1.50 | 1.50 | 1.60 | 1.33 |
| cycles | 1.80 | 3.00 | 1.33 | 1.33 | 1.50 | 1.80 | 1.80 | 1.50 | 1.50 | 1.50 | 1.60 | 1.33 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 2.83 |
| Dispatch | 3.00 |
| Overall L1 | 3.00 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 8% |
| load | 9% |
| store | 9% |
| mul | 6% |
| add-sub | 9% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x160(%RSP),%ECX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (6.3%) |
| INC %ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| MOV 0x50(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| ADD %RDX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (12.5%) |
| CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| JE 661f <main+0x390f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| IMUL %EDX,%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %ECX,0x160(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| IMUL %R8D,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV %RCX,0x60(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| ADD 0x2e0(%RSP),%ESI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
| SETB %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| OR 0x46(%RSP),%CL | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1-2 | 0.33 | N/A |
| MOV %CL,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| JMP 6352 <main+0x3642> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:26-292 |
| Module | attention-clang-gnr256 |
| nb instructions | 17 |
| nb uops | 17 |
| loop length | 68 |
| used x86 registers | 7 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 2.83 cycles |
| front end | 2.83 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.80 | 2.00 | 1.33 | 1.33 | 1.50 | 1.80 | 1.80 | 1.50 | 1.50 | 1.50 | 1.60 | 1.33 |
| cycles | 1.80 | 3.00 | 1.33 | 1.33 | 1.50 | 1.80 | 1.80 | 1.50 | 1.50 | 1.50 | 1.60 | 1.33 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 2.83 |
| Dispatch | 3.00 |
| Overall L1 | 3.00 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 8% |
| load | 9% |
| store | 9% |
| mul | 6% |
| add-sub | 9% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x160(%RSP),%ECX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (6.3%) |
| INC %ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| MOV 0x50(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| ADD %RDX,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (12.5%) |
| CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | scal (6.3%) |
| JE 661f <main+0x390f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| IMUL %EDX,%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %ECX,0x160(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| IMUL %R8D,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV %RCX,0x60(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| ADD 0x2e0(%RSP),%ESI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
| SETB %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| OR 0x46(%RSP),%CL | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1-2 | 0.33 | N/A |
| MOV %CL,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| JMP 6352 <main+0x3642> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 | N/A |
