| Loop Id: 66 | Module: attention-aocc-znver5-512 | Source: attention_v2.cpp:26-33 | Coverage: 0.46% |
|---|
| Loop Id: 66 | Module: attention-aocc-znver5-512 | Source: attention_v2.cpp:26-33 | Coverage: 0.46% |
|---|
0x5e10 MOV 0x68(%RSP),%R9 |
0x5e15 VCVTSD2SS %XMM0,%XMM0,%XMM0 |
0x5e19 LEA (%RSI,%RDI,1),%R8D |
0x5e1d INC %RDI |
0x5e20 VMOVSS %XMM0,(%R9,%R8,4) |
0x5e26 CMP %R15,%RDI |
0x5e29 JE 5dc0 |
0x5e2b VXORPS %XMM0,%XMM0,%XMM0 |
0x5e2f CMP $0x4,%R15D |
0x5e33 JB 5e49 |
0x5e35 MOV %EDI,%R8D |
0x5e38 ADD 0xb8(%RSP),%R8D |
0x5e40 SETB %R8B |
0x5e44 OR %DL,%R8B |
0x5e47 JE 5e90 |
0x5e49 XOR %R8D,%R8D |
0x5e4c MOV %R14,%R9 |
0x5e4f IMUL %R8,%R9 |
0x5e53 ADD %RDI,%R9 |
0x5e56 NOPW %CS:(%RAX,%RAX,1) |
(31) 0x5e60 LEA (%RAX,%R8,1),%R10D |
(31) 0x5e64 INC %R8 |
(31) 0x5e67 VMOVSS (%RBX,%R10,4),%XMM1 |
(31) 0x5e6d MOV %R9D,%R10D |
(31) 0x5e70 VMOVSS (%R11,%R10,4),%XMM2 |
(31) 0x5e76 ADD %R14,%R9 |
(31) 0x5e79 VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(31) 0x5e7d VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(31) 0x5e81 VFMADD231SD %XMM2,%XMM1,%XMM0 |
(31) 0x5e86 CMP %R8,%R15 |
(31) 0x5e89 JNE 5e60 |
0x5e8b JMP 5e10 |
0x5e90 CMP $0x20,%R15D |
0x5e94 JAE 5e9e |
0x5e96 XOR %R9D,%R9D |
0x5e99 JMP 5f7f |
0x5e9e MOV 0xa0(%RSP),%R10 |
0x5ea6 VXORPS %XMM0,%XMM0,%XMM0 |
0x5eaa VXORPS %XMM1,%XMM1,%XMM1 |
0x5eae VXORPS %XMM2,%XMM2,%XMM2 |
0x5eb2 VPXOR %XMM3,%XMM3,%XMM3 |
0x5eb6 XOR %R8D,%R8D |
0x5eb9 NOPL (%RAX) |
(67) 0x5ec0 LEA (%RAX,%R8,1),%R9D |
(67) 0x5ec4 VCVTPS2PD (%RBX,%R9,4),%ZMM4 |
(67) 0x5ecb VCVTPS2PD 0x20(%RBX,%R9,4),%ZMM5 |
(67) 0x5ed3 VCVTPS2PD 0x40(%RBX,%R9,4),%ZMM6 |
(67) 0x5edb VCVTPS2PD 0x60(%RBX,%R9,4),%ZMM7 |
(67) 0x5ee3 LEA (%RDI,%R8,1),%R9D |
(67) 0x5ee7 ADD $0x20,%R8 |
(67) 0x5eeb VCVTPS2PD (%R11,%R9,4),%ZMM8 |
(67) 0x5ef2 VCVTPS2PD 0x20(%R11,%R9,4),%ZMM9 |
(67) 0x5efa VFMADD231PD %ZMM8,%ZMM4,%ZMM0 |
(67) 0x5f00 VCVTPS2PD 0x40(%R11,%R9,4),%ZMM8 |
(67) 0x5f08 VFMADD231PD %ZMM9,%ZMM5,%ZMM1 |
(67) 0x5f0e VCVTPS2PD 0x60(%R11,%R9,4),%ZMM5 |
(67) 0x5f16 VFMADD231PD %ZMM8,%ZMM6,%ZMM2 |
(67) 0x5f1c VFMADD231PD %ZMM5,%ZMM7,%ZMM3 |
(67) 0x5f22 CMP %R8,%R10 |
(67) 0x5f25 JNE 5ec0 |
0x5f27 VADDPD %ZMM0,%ZMM1,%ZMM0 |
0x5f2d VADDPD %ZMM2,%ZMM3,%ZMM2 |
0x5f33 MOV 0x220(%RSP),%R10 |
0x5f3b VADDPD %ZMM0,%ZMM2,%ZMM0 |
0x5f41 VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 |
0x5f48 VADDPD %ZMM1,%ZMM0,%ZMM0 |
0x5f4e VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x5f54 VADDPD %XMM1,%XMM0,%XMM0 |
0x5f58 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x5f5d VADDSD %XMM1,%XMM0,%XMM0 |
0x5f61 TEST %R10,%R10 |
0x5f64 JE 5e10 |
0x5f6a MOV 0xa0(%RSP),%R8 |
0x5f72 MOV %R8,%R9 |
0x5f75 CMP $0x4,%R10D |
0x5f79 JB 5e4c |
0x5f7f MOV 0x228(%RSP),%R10 |
0x5f87 VMOVQ %XMM0,%XMM0 |
0x5f8b NOPL (%RAX,%RAX,1) |
(68) 0x5f90 LEA (%RAX,%R9,1),%R8D |
(68) 0x5f94 VCVTPS2PD (%RBX,%R8,4),%YMM1 |
(68) 0x5f9a LEA (%RDI,%R9,1),%R8D |
(68) 0x5f9e ADD $0x4,%R9 |
(68) 0x5fa2 VCVTPS2PD (%R11,%R8,4),%YMM2 |
(68) 0x5fa8 VFMADD231PD %YMM2,%YMM1,%YMM0 |
(68) 0x5fad CMP %R9,%R10 |
(68) 0x5fb0 JNE 5f90 |
0x5fb2 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x5fb8 CMPQ $0,0x230(%RSP) |
0x5fc1 MOV %R10,%R8 |
0x5fc4 VADDPD %XMM1,%XMM0,%XMM0 |
0x5fc8 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x5fcd VADDSD %XMM1,%XMM0,%XMM0 |
0x5fd1 JNE 5e4c |
0x5fd7 JMP 5e10 |
/home/eoseret/llm-attention/attention_v2.cpp: 26 - 33 |
-------------------------------------------------------------------------------- |
26: for (unsigned int i = 0; i < M; ++i) { |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.64 |
| CQA speedup if FP arith vectorized | 1.06 |
| CQA speedup if fully vectorized | 1.36 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.64 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention_v2.cpp:26-27,attention_v2.cpp:30-30,attention_v2.cpp:33-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.38 |
| CQA cycles if no scalar integer | 4.50 |
| CQA cycles if FP arith vectorized | 6.97 |
| CQA cycles if fully vectorized | 5.41 |
| Front-end cycles | 7.38 |
| P0 cycles | 3.83 |
| P1 cycles | 3.83 |
| P2 cycles | 3.83 |
| P3 cycles | 3.83 |
| P4 cycles | 3.83 |
| P5 cycles | 3.83 |
| P6 cycles | 2.00 |
| P7 cycles | 2.00 |
| P8 cycles | 2.00 |
| P9 cycles | 2.00 |
| P10 cycles | 3.00 |
| P11 cycles | 3.00 |
| P12 cycles | 4.50 |
| P13 cycles | 4.50 |
| P14 cycles | 0.50 |
| P15 cycles | 0.50 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 59.00 |
| Nb uops | 59.00 |
| Nb loads | 7.00 |
| Nb stores | 1.00 |
| Nb stack references | 6.00 |
| FLOP/cycle | 5.15 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.59 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 52.00 |
| Bytes stored | 4.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 55.17 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 75.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 58.82 |
| Vector-efficiency ratio all | 29.74 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 59.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 20.22 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.64 |
| CQA speedup if FP arith vectorized | 1.06 |
| CQA speedup if fully vectorized | 1.36 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.64 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention_v2.cpp:26-27,attention_v2.cpp:30-30,attention_v2.cpp:33-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.38 |
| CQA cycles if no scalar integer | 4.50 |
| CQA cycles if FP arith vectorized | 6.97 |
| CQA cycles if fully vectorized | 5.41 |
| Front-end cycles | 7.38 |
| P0 cycles | 3.83 |
| P1 cycles | 3.83 |
| P2 cycles | 3.83 |
| P3 cycles | 3.83 |
| P4 cycles | 3.83 |
| P5 cycles | 3.83 |
| P6 cycles | 2.00 |
| P7 cycles | 2.00 |
| P8 cycles | 2.00 |
| P9 cycles | 2.00 |
| P10 cycles | 3.00 |
| P11 cycles | 3.00 |
| P12 cycles | 4.50 |
| P13 cycles | 4.50 |
| P14 cycles | 0.50 |
| P15 cycles | 0.50 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 59.00 |
| Nb uops | 59.00 |
| Nb loads | 7.00 |
| Nb stores | 1.00 |
| Nb stack references | 6.00 |
| FLOP/cycle | 5.15 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.59 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 52.00 |
| Bytes stored | 4.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 55.17 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 75.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 58.82 |
| Vector-efficiency ratio all | 29.74 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 59.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 20.22 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:26-33 |
| Module | attention-aocc-znver5-512 |
| nb instructions | 59 |
| nb uops | 59 |
| loop length | 277 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 2 |
| used zmm registers | 4 |
| nb stack references | 6 |
| micro-operation queue | 7.38 cycles |
| front end | 7.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 2.00 | 2.00 | 2.00 | 2.00 | 3.00 | 3.00 | 4.50 | 4.50 | 0.50 | 0.50 |
| cycles | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 2.00 | 2.00 | 2.00 | 2.00 | 3.00 | 3.00 | 4.50 | 4.50 | 0.50 | 0.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 7.38 |
| Dispatch | 4.50 |
| Overall L1 | 7.38 |
| all | 10% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 14% |
| all | 78% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 90% |
| all | 55% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 58% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 11% |
| all | 39% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| all | 29% |
| load | 12% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 20% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x68(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VCVTSD2SS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| LEA (%RSI,%RDI,1),%R8D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| INC %RDI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| VMOVSS %XMM0,(%R9,%R8,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| CMP %R15,%RDI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| JE 5dc0 <main+0x1460> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| CMP $0x4,%R15D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JB 5e49 <main+0x14e9> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV %EDI,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| ADD 0xb8(%RSP),%R8D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SETB %R8B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| OR %DL,%R8B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| JE 5e90 <main+0x1530> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV %R14,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| IMUL %R8,%R9 | 1 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 | N/A |
| ADD %RDI,%R9 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 5e10 <main+0x14b0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| CMP $0x20,%R15D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JAE 5e9e <main+0x153e> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JMP 5f7f <main+0x161f> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV 0xa0(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| VADDPD %ZMM0,%ZMM1,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| VADDPD %ZMM2,%ZMM3,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| MOV 0x220(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| VADDPD %ZMM0,%ZMM2,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 4 | 0.25 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM0,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 2 | 0.25 | vect (25.0%) |
| VADDPD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1-2 | 0.25 | vect (25.0%) |
| VADDSD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| TEST %R10,%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| JE 5e10 <main+0x14b0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0xa0(%RSP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| CMP $0x4,%R10D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JB 5e4c <main+0x14ec> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x228(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| VMOVQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 2 | 0.25 | vect (25.0%) |
| CMPQ $0,0x230(%RSP) | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOV %R10,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| VADDPD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1-2 | 0.25 | vect (25.0%) |
| VADDSD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| JNE 5e4c <main+0x14ec> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| JMP 5e10 <main+0x14b0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:26-33 |
| Module | attention-aocc-znver5-512 |
| nb instructions | 59 |
| nb uops | 59 |
| loop length | 277 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 2 |
| used zmm registers | 4 |
| nb stack references | 6 |
| micro-operation queue | 7.38 cycles |
| front end | 7.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 2.00 | 2.00 | 2.00 | 2.00 | 3.00 | 3.00 | 4.50 | 4.50 | 0.50 | 0.50 |
| cycles | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 3.83 | 2.00 | 2.00 | 2.00 | 2.00 | 3.00 | 3.00 | 4.50 | 4.50 | 0.50 | 0.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 7.38 |
| Dispatch | 4.50 |
| Overall L1 | 7.38 |
| all | 10% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 14% |
| all | 78% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 90% |
| all | 55% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 58% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 11% |
| all | 39% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| all | 29% |
| load | 12% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 20% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x68(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VCVTSD2SS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| LEA (%RSI,%RDI,1),%R8D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| INC %RDI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| VMOVSS %XMM0,(%R9,%R8,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| CMP %R15,%RDI | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| JE 5dc0 <main+0x1460> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| CMP $0x4,%R15D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JB 5e49 <main+0x14e9> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV %EDI,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| ADD 0xb8(%RSP),%R8D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SETB %R8B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| OR %DL,%R8B | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| JE 5e90 <main+0x1530> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| MOV %R14,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| IMUL %R8,%R9 | 1 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 | N/A |
| ADD %RDI,%R9 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 5e10 <main+0x14b0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| CMP $0x20,%R15D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JAE 5e9e <main+0x153e> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JMP 5f7f <main+0x161f> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV 0xa0(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| VADDPD %ZMM0,%ZMM1,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| VADDPD %ZMM2,%ZMM3,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| MOV 0x220(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| VADDPD %ZMM0,%ZMM2,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 4 | 0.25 | vect (50.0%) |
| VADDPD %ZMM1,%ZMM0,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 2 | 0.25 | vect (25.0%) |
| VADDPD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1-2 | 0.25 | vect (25.0%) |
| VADDSD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| TEST %R10,%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| JE 5e10 <main+0x14b0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0xa0(%RSP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| CMP $0x4,%R10D | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| JB 5e4c <main+0x14ec> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x228(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| VMOVQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 2 | 0.25 | vect (25.0%) |
| CMPQ $0,0x230(%RSP) | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOV %R10,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| VADDPD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 1-2 | 0.25 | vect (25.0%) |
| VADDSD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| JNE 5e4c <main+0x14ec> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| JMP 5e10 <main+0x14b0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
