| Loop Id: 29 | Module: attention-avx512 | Source: attention.cpp:26-306 [...] | Coverage: 0.05% |
|---|
| Loop Id: 29 | Module: attention-avx512 | Source: attention.cpp:26-306 [...] | Coverage: 0.05% |
|---|
0x6b90 MOV 0xc0(%RSP),%ESI |
0x6b97 MOV 0x50(%RSP),%RCX |
0x6b9c INC %ESI |
0x6b9e ADD %RCX,%RDI |
0x6ba1 CMP %ECX,%ESI |
0x6ba3 JE 6b60 |
0x6ba5 MOV 0x80(%RSP),%RAX |
0x6bad CMP $0x1,%R14 |
0x6bb1 MOV %ESI,%R12D |
0x6bb4 MOV %ESI,%R10D |
0x6bb7 MOV %ESI,0xc0(%RSP) |
0x6bbe SETNE %R11B |
0x6bc2 MOV %RAX,%R9 |
0x6bc5 SHR $0x20,%R9 |
0x6bc9 SETNE %R9B |
0x6bcd IMUL %ECX,%R12D |
0x6bd1 IMUL %R14D,%R10D |
0x6bd5 ADD %EAX,%R12D |
0x6bd8 SETB %R12B |
0x6bdc OR %R11B,%R9B |
0x6bdf XOR %R11D,%R11D |
0x6be2 OR %R12B,%R9B |
0x6be5 JMP 6c1a |
(30) 0x6bf0 MOV 0x198(%RSP),%RSI |
(30) 0x6bf8 MOV 0x158(%RSP),%R13 |
(30) 0x6c00 VCVTSD2SS %XMM0,%XMM0,%XMM0 |
(30) 0x6c04 LEA (%R10,%R11,1),%R12D |
(30) 0x6c08 INC %R11 |
(30) 0x6c0b VMOVSS %XMM0,(%RSI,%R12,4) |
(30) 0x6c11 CMP %R15,%R11 |
(30) 0x6c14 JE 6b90 |
(30) 0x6c1a CMPL $0x10,0x58(%RSP) |
(30) 0x6c1f VXORPS %XMM0,%XMM0,%XMM0 |
(30) 0x6c23 JB 6c39 |
(30) 0x6c25 MOV %R11D,%R12D |
(30) 0x6c28 ADD 0x80(%RSP),%R12D |
(30) 0x6c30 SETB %R12B |
(30) 0x6c34 OR %R9B,%R12B |
(30) 0x6c37 JE 6c90 |
(30) 0x6c39 MOV 0x130(%RSP),%RCX |
(30) 0x6c41 MOV 0x58(%RSP),%R8 |
(30) 0x6c46 MOV 0x100(%RSP),%RAX |
(30) 0x6c4e XOR %R13D,%R13D |
(30) 0x6c51 MOV %R14,%R12 |
(30) 0x6c54 IMUL %R13,%R12 |
(30) 0x6c58 ADD %R11,%R12 |
(30) 0x6c5b NOPL (%RAX,%RAX,1) |
(27) 0x6c60 LEA (%RDI,%R13,1),%ESI |
(27) 0x6c64 INC %R13 |
(27) 0x6c67 VMOVSS (%RAX,%RSI,4),%XMM1 |
(27) 0x6c6c MOV %R12D,%ESI |
(27) 0x6c6f VMOVSS (%RCX,%RSI,4),%XMM2 |
(27) 0x6c74 ADD %R14,%R12 |
(27) 0x6c77 VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(27) 0x6c7b VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(27) 0x6c7f VFMADD231SD %XMM2,%XMM1,%XMM0 |
(27) 0x6c84 CMP %R13,%R8 |
(27) 0x6c87 JNE 6c60 |
(30) 0x6c89 JMP 6bf0 |
(30) 0x6c90 MOV 0x130(%RSP),%RCX |
(30) 0x6c98 MOV 0x100(%RSP),%RAX |
(30) 0x6ca0 VXORPS %XMM0,%XMM0,%XMM0 |
(30) 0x6ca4 VXORPS %XMM1,%XMM1,%XMM1 |
(30) 0x6ca8 VXORPS %XMM2,%XMM2,%XMM2 |
(30) 0x6cac VPXOR %XMM3,%XMM3,%XMM3 |
(30) 0x6cb0 XOR %R13D,%R13D |
(30) 0x6cb3 NOPW %CS:(%RAX,%RAX,1) |
(31) 0x6cc0 LEA (%RDI,%R13,1),%R12D |
(31) 0x6cc4 VCVTPS2PD (%RAX,%R12,4),%YMM4 |
(31) 0x6cca VCVTPS2PD 0x10(%RAX,%R12,4),%YMM5 |
(31) 0x6cd1 VCVTPS2PD 0x20(%RAX,%R12,4),%YMM6 |
(31) 0x6cd8 VCVTPS2PD 0x30(%RAX,%R12,4),%YMM7 |
(31) 0x6cdf LEA (%R11,%R13,1),%R12D |
(31) 0x6ce3 ADD $0x10,%R13 |
(31) 0x6ce7 VCVTPS2PD (%RCX,%R12,4),%YMM8 |
(31) 0x6ced VCVTPS2PD 0x10(%RCX,%R12,4),%YMM9 |
(31) 0x6cf4 VFMADD231PD %YMM8,%YMM4,%YMM0 |
(31) 0x6cf9 VFMADD231PD %YMM9,%YMM5,%YMM1 |
(31) 0x6cfe VCVTPS2PD 0x20(%RCX,%R12,4),%YMM8 |
(31) 0x6d05 VCVTPS2PD 0x30(%RCX,%R12,4),%YMM5 |
(31) 0x6d0c VFMADD231PD %YMM8,%YMM6,%YMM2 |
(31) 0x6d11 VFMADD231PD %YMM5,%YMM7,%YMM3 |
(31) 0x6d16 CMP %R13,%RDX |
(31) 0x6d19 JNE 6cc0 |
(30) 0x6d1b VADDPD %YMM0,%YMM1,%YMM0 |
(30) 0x6d1f VADDPD %YMM2,%YMM3,%YMM1 |
(30) 0x6d23 CMPQ $0,0x18(%RSP) |
(30) 0x6d29 MOV 0x58(%RSP),%R8 |
(30) 0x6d2e MOV %RDX,%R13 |
(30) 0x6d31 VADDPD %YMM0,%YMM1,%YMM0 |
(30) 0x6d35 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
(30) 0x6d3b VADDPD %XMM1,%XMM0,%XMM0 |
(30) 0x6d3f VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
(30) 0x6d44 VADDSD %XMM1,%XMM0,%XMM0 |
(30) 0x6d48 JNE 6c51 |
(30) 0x6d4e JMP 6bf0 |
/home/eoseret/llm-attention/attention.cpp: 26 - 306 |
-------------------------------------------------------------------------------- |
26: for (unsigned int i = 0; i < M; ++i) { |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
[...] |
306: for (size_t r = 0; r < rept; r++) |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 13.63 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.15 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention.cpp:26-26,attention.cpp:306-306 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.88 |
| CQA cycles if no scalar integer | 2.88 |
| CQA cycles if FP arith vectorized | 2.88 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.88 |
| P0 cycles | 2.50 |
| P1 cycles | 2.50 |
| P2 cycles | 2.50 |
| P3 cycles | 2.50 |
| P4 cycles | 2.50 |
| P5 cycles | 2.50 |
| P6 cycles | 1.00 |
| P7 cycles | 1.00 |
| P8 cycles | 1.00 |
| P9 cycles | 1.00 |
| P10 cycles | 0.00 |
| P11 cycles | 0.00 |
| P12 cycles | 0.00 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| P15 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 23.00 |
| Nb uops | 23.00 |
| Nb loads | 3.00 |
| Nb stores | 1.00 |
| Nb stack references | 3.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.35 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 20.00 |
| Bytes stored | 4.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.85 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 9.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.38 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 13.63 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.15 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention.cpp:26-26,attention.cpp:306-306 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.88 |
| CQA cycles if no scalar integer | 2.88 |
| CQA cycles if FP arith vectorized | 2.88 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.88 |
| P0 cycles | 2.50 |
| P1 cycles | 2.50 |
| P2 cycles | 2.50 |
| P3 cycles | 2.50 |
| P4 cycles | 2.50 |
| P5 cycles | 2.50 |
| P6 cycles | 1.00 |
| P7 cycles | 1.00 |
| P8 cycles | 1.00 |
| P9 cycles | 1.00 |
| P10 cycles | 0.00 |
| P11 cycles | 0.00 |
| P12 cycles | 0.00 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| P15 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 23.00 |
| Nb uops | 23.00 |
| Nb loads | 3.00 |
| Nb stores | 1.00 |
| Nb stack references | 3.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.35 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 20.00 |
| Bytes stored | 4.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.85 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 9.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.38 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:26-306 |
| Module | attention-avx512 |
| nb instructions | 23 |
| nb uops | 23 |
| loop length | 87 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 3 |
| micro-operation queue | 2.88 cycles |
| front end | 2.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| cycles | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 2.88 |
| Dispatch | 2.50 |
| Data deps. | 0.00 |
| Overall L1 | 2.88 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 8% |
| load | 9% |
| store | 6% |
| mul | 6% |
| add-sub | 9% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0xc0(%RSP),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (6.3%) |
| MOV 0x50(%RSP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| INC %ESI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| ADD %RCX,%RDI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| CMP %ECX,%ESI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JE 6b60 <main+0x2ec0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-1 | N/A |
| MOV 0x80(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| CMP $0x1,%R14 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| MOV %ESI,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV %ESI,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| MOV %ESI,0xc0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| SETNE %R11B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| MOV %RAX,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (12.5%) |
| SHR $0x20,%R9 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| SETNE %R9B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| IMUL %ECX,%R12D | 1 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 | N/A |
| IMUL %R14D,%R10D | 1 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 | scal (6.3%) |
| ADD %EAX,%R12D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| SETB %R12B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| OR %R11B,%R9B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| XOR %R11D,%R11D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| OR %R12B,%R9B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| JMP 6c1a <main+0x2f7a> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| Function | main |
| Source file and lines | attention.cpp:26-306 |
| Module | attention-avx512 |
| nb instructions | 23 |
| nb uops | 23 |
| loop length | 87 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 3 |
| micro-operation queue | 2.88 cycles |
| front end | 2.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| cycles | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 2.50 | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 2.88 |
| Dispatch | 2.50 |
| Data deps. | 0.00 |
| Overall L1 | 2.88 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 8% |
| load | 9% |
| store | 6% |
| mul | 6% |
| add-sub | 9% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0xc0(%RSP),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (6.3%) |
| MOV 0x50(%RSP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| INC %ESI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| ADD %RCX,%RDI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| CMP %ECX,%ESI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JE 6b60 <main+0x2ec0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-1 | N/A |
| MOV 0x80(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| CMP $0x1,%R14 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| MOV %ESI,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV %ESI,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| MOV %ESI,0xc0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| SETNE %R11B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| MOV %RAX,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (12.5%) |
| SHR $0x20,%R9 | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| SETNE %R9B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| IMUL %ECX,%R12D | 1 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 | N/A |
| IMUL %R14D,%R10D | 1 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 | scal (6.3%) |
| ADD %EAX,%R12D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| SETB %R12B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| OR %R11B,%R9B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| XOR %R11D,%R11D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| OR %R12B,%R9B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| JMP 6c1a <main+0x2f7a> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
