| Loop Id: 49 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 8.74% |
|---|
| Loop Id: 49 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 8.74% |
|---|
0x4440 VCVTSD2SS %XMM1,%XMM1,%XMM1 |
0x4444 VMULSS %XMM0,%XMM1,%XMM1 |
0x4448 MOV 0x10(%RSP),%RCX |
0x444d ADD %EAX,%ECX |
0x444f MOV 0x50(%RSP),%R15 |
0x4454 VMOVSS %XMM1,(%R15,%RCX,4) |
0x445a INC %RAX |
0x445d CMP 0x20(%RSP),%RAX |
0x4462 MOV 0x48(%RSP),%R13 |
0x4467 JE 43f0 |
0x4469 VXORPS %XMM1,%XMM1,%XMM1 |
0x446d MOV 0x28(%RSP),%RSI |
0x4472 CMP $0x4,%ESI |
0x4475 JB 448d |
0x4477 MOV %EAX,%ECX |
0x4479 ADD 0x80(%RSP),%ECX |
0x4480 SETB %CL |
0x4483 OR 0x40(%RSP),%CL |
0x4487 JE 45f0 |
0x448d XOR %ECX,%ECX |
0x448f MOV 0x30(%RSP),%R9 |
0x4494 TEST %R9,%R9 |
0x4497 JE 44f0 |
0x4499 MOV 0x18(%RSP),%R13 |
0x449e MOV %R13,%RDI |
0x44a1 IMUL %RCX,%RDI |
0x44a5 ADD %RAX,%RDI |
0x44a8 MOV %RCX,%R11 |
0x44ab MOV 0x58(%RSP),%RDX |
(51) 0x44b0 LEA (%R10,%R11,1),%R15D |
(51) 0x44b4 VMOVSS (%RDX,%R15,4),%XMM2 |
(51) 0x44ba VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(51) 0x44be MOV %EDI,%R15D |
(51) 0x44c1 VMOVSS (%RBX,%R15,4),%XMM3 |
(51) 0x44c7 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(51) 0x44cb VFMADD231SD %XMM3,%XMM2,%XMM1 |
(51) 0x44d0 INC %R11 |
(51) 0x44d3 ADD %R13,%RDI |
(51) 0x44d6 DEC %R9 |
(51) 0x44d9 JNE 44b0 |
0x44db MOV 0x28(%RSP),%RSI |
0x44e0 SUB %RSI,%RCX |
0x44e3 CMP $-0x4,%RCX |
0x44e7 JA 4440 |
0x44ed JMP 450a |
0x44f0 MOV %RCX,%R11 |
0x44f3 MOV 0x58(%RSP),%RDX |
0x44f8 MOV 0x28(%RSP),%RSI |
0x44fd SUB %RSI,%RCX |
0x4500 CMP $-0x4,%RCX |
0x4504 JA 4440 |
0x450a LEA 0x3(%R11),%R13 |
0x450e MOV 0x18(%RSP),%R15 |
0x4513 IMUL %R15,%R13 |
0x4517 ADD %RAX,%R13 |
0x451a LEA 0x2(%R11),%RCX |
0x451e IMUL %R15,%RCX |
0x4522 ADD %RAX,%RCX |
0x4525 MOV %R15,%R9 |
0x4528 IMUL %R11,%R9 |
0x452c ADD %RAX,%R9 |
0x452f LEA 0x1(%R11),%RDI |
0x4533 IMUL %R15,%RDI |
0x4537 ADD %RAX,%RDI |
0x453a NOPW (%RAX,%RAX,1) |
(50) 0x4540 LEA (%R10,%R11,1),%R15D |
(50) 0x4544 VMOVSS (%RDX,%R15,4),%XMM2 |
(50) 0x454a VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(50) 0x454e MOV %R9D,%R15D |
(50) 0x4551 VMOVSS (%RBX,%R15,4),%XMM3 |
(50) 0x4557 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(50) 0x455b LEA 0x1(%R10,%R11,1),%R15D |
(50) 0x4560 VMOVSS (%RDX,%R15,4),%XMM4 |
(50) 0x4566 VCVTSS2SD %XMM4,%XMM4,%XMM4 |
(50) 0x456a MOV %EDI,%R15D |
(50) 0x456d VMOVSS (%RBX,%R15,4),%XMM5 |
(50) 0x4573 VCVTSS2SD %XMM5,%XMM5,%XMM5 |
(50) 0x4577 VFMADD213SD %XMM1,%XMM2,%XMM3 |
(50) 0x457c LEA 0x2(%R10,%R11,1),%R15D |
(50) 0x4581 VMOVSS (%RDX,%R15,4),%XMM1 |
(50) 0x4587 VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(50) 0x458b VFMADD213SD %XMM3,%XMM4,%XMM5 |
(50) 0x4590 MOV %ECX,%R15D |
(50) 0x4593 VMOVSS (%RBX,%R15,4),%XMM2 |
(50) 0x4599 VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(50) 0x459d VFMADD213SD %XMM5,%XMM1,%XMM2 |
(50) 0x45a2 LEA 0x3(%R10,%R11,1),%R15D |
(50) 0x45a7 VMOVSS (%RDX,%R15,4),%XMM1 |
(50) 0x45ad VCVTSS2SD %XMM1,%XMM1,%XMM3 |
(50) 0x45b1 MOV %R13D,%R15D |
(50) 0x45b4 VMOVSS (%RBX,%R15,4),%XMM1 |
(50) 0x45ba VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(50) 0x45be VFMADD213SD %XMM2,%XMM3,%XMM1 |
(50) 0x45c3 ADD $0x4,%R11 |
(50) 0x45c7 ADD %R8,%R13 |
(50) 0x45ca ADD %R8,%RCX |
(50) 0x45cd ADD %R8,%R9 |
(50) 0x45d0 ADD %R8,%RDI |
(50) 0x45d3 CMP %R11,%RSI |
(50) 0x45d6 JNE 4540 |
0x45dc JMP 4440 |
0x45f0 CMP $0x20,%ESI |
0x45f3 JAE 4601 |
0x45f5 XOR %EDI,%EDI |
0x45f7 MOV 0x58(%RSP),%RDX |
0x45fc JMP 46d6 |
0x4601 VXORPS %XMM1,%XMM1,%XMM1 |
0x4605 XOR %ECX,%ECX |
0x4607 VPXOR %XMM2,%XMM2,%XMM2 |
0x460b VPXOR %XMM3,%XMM3,%XMM3 |
0x460f VPXOR %XMM4,%XMM4,%XMM4 |
0x4613 MOV 0x58(%RSP),%RDX |
0x4618 MOV 0x120(%RSP),%R9 |
(52) 0x4620 LEA (%R10,%RCX,1),%EDI |
(52) 0x4624 VCVTPS2PD (%RDX,%RDI,4),%ZMM5 |
(52) 0x462b VCVTPS2PD 0x20(%RDX,%RDI,4),%ZMM6 |
(52) 0x4633 VCVTPS2PD 0x40(%RDX,%RDI,4),%ZMM7 |
(52) 0x463b VCVTPS2PD 0x60(%RDX,%RDI,4),%ZMM8 |
(52) 0x4643 LEA (%RAX,%RCX,1),%EDI |
(52) 0x4646 VCVTPS2PD (%RBX,%RDI,4),%ZMM9 |
(52) 0x464d VFMADD231PD %ZMM9,%ZMM5,%ZMM1 |
(52) 0x4653 VCVTPS2PD 0x20(%RBX,%RDI,4),%ZMM5 |
(52) 0x465b VFMADD231PD %ZMM5,%ZMM6,%ZMM2 |
(52) 0x4661 VCVTPS2PD 0x40(%RBX,%RDI,4),%ZMM5 |
(52) 0x4669 VFMADD231PD %ZMM5,%ZMM7,%ZMM3 |
(52) 0x466f VCVTPS2PD 0x60(%RBX,%RDI,4),%ZMM5 |
(52) 0x4677 VFMADD231PD %ZMM5,%ZMM8,%ZMM4 |
(52) 0x467d ADD $0x20,%RCX |
(52) 0x4681 CMP %RCX,%R9 |
(52) 0x4684 JNE 4620 |
0x4686 VADDPD %ZMM1,%ZMM2,%ZMM1 |
0x468c VADDPD %ZMM3,%ZMM4,%ZMM2 |
0x4692 VADDPD %ZMM1,%ZMM2,%ZMM1 |
0x4698 VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 |
0x469f VADDPD %ZMM2,%ZMM1,%ZMM1 |
0x46a5 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
0x46ab VADDPD %XMM2,%XMM1,%XMM1 |
0x46af VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
0x46b4 VADDSD %XMM2,%XMM1,%XMM1 |
0x46b8 CMP %R9D,%ESI |
0x46bb JE 4440 |
0x46c1 MOV %R9,%RDI |
0x46c4 MOV %R9,%RCX |
0x46c7 TEST $0x1c,%R12B |
0x46cb MOV 0x58(%RSP),%RDX |
0x46d0 JE 448f |
0x46d6 VMOVQ %XMM1,%XMM1 |
0x46da MOV 0xe0(%RSP),%RSI |
0x46e2 NOPW %CS:(%RAX,%RAX,1) |
(53) 0x46f0 LEA (%R10,%RDI,1),%ECX |
(53) 0x46f4 VCVTPS2PD (%RDX,%RCX,4),%YMM2 |
(53) 0x46f9 LEA (%RAX,%RDI,1),%ECX |
(53) 0x46fc VCVTPS2PD (%RBX,%RCX,4),%YMM3 |
(53) 0x4701 VFMADD231PD %YMM3,%YMM2,%YMM1 |
(53) 0x4706 ADD $0x4,%RDI |
(53) 0x470a CMP %RDI,%RSI |
(53) 0x470d JNE 46f0 |
0x470f VEXTRACTF128 $0x1,%YMM1,%XMM2 |
0x4715 VADDPD %XMM2,%XMM1,%XMM1 |
0x4719 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
0x471e VADDSD %XMM2,%XMM1,%XMM1 |
0x4722 MOV %RSI,%RCX |
0x4725 CMP %ESI,0x28(%RSP) |
0x4729 JE 4440 |
0x472f JMP 448f |
/home/eoseret/Applications/llm-attention/attention.cpp: 27 - 33 |
-------------------------------------------------------------------------------- |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-avx512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.88 |
| CQA speedup if FP arith vectorized | 1.05 |
| CQA speedup if fully vectorized | 1.48 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.58 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-30,attention.cpp:33-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 23.75 |
| CQA cycles if no scalar integer | 8.25 |
| CQA cycles if FP arith vectorized | 22.69 |
| CQA cycles if fully vectorized | 16.00 |
| Front-end cycles | 23.75 |
| P0 cycles | 15.00 |
| P1 cycles | 15.00 |
| P2 cycles | 10.00 |
| P3 cycles | 10.00 |
| P4 cycles | 1.00 |
| P5 cycles | 15.00 |
| P6 cycles | 15.00 |
| P7 cycles | 1.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 94.00 |
| Nb uops | 95.00 |
| Nb loads | 20.00 |
| Nb stores | 1.00 |
| Nb stack references | 12.00 |
| FLOP/cycle | 1.64 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 6.27 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 145.00 |
| Bytes stored | 4.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 32.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 50.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 43.48 |
| Vector-efficiency ratio all | 22.25 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | 11.25 |
| Vector-efficiency ratio add_sub | 43.75 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 17.93 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.88 |
| CQA speedup if FP arith vectorized | 1.05 |
| CQA speedup if fully vectorized | 1.48 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.58 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-30,attention.cpp:33-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 23.75 |
| CQA cycles if no scalar integer | 8.25 |
| CQA cycles if FP arith vectorized | 22.69 |
| CQA cycles if fully vectorized | 16.00 |
| Front-end cycles | 23.75 |
| P0 cycles | 15.00 |
| P1 cycles | 15.00 |
| P2 cycles | 10.00 |
| P3 cycles | 10.00 |
| P4 cycles | 1.00 |
| P5 cycles | 15.00 |
| P6 cycles | 15.00 |
| P7 cycles | 1.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 94.00 |
| Nb uops | 95.00 |
| Nb loads | 20.00 |
| Nb stores | 1.00 |
| Nb stack references | 12.00 |
| FLOP/cycle | 1.64 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 6.27 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 145.00 |
| Bytes stored | 4.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 32.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 50.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 43.48 |
| Vector-efficiency ratio all | 22.25 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | 11.25 |
| Vector-efficiency ratio add_sub | 43.75 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 17.93 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 94 |
| nb uops | 95 |
| loop length | 408 |
| used x86 registers | 11 |
| used mmx registers | 0 |
| used xmm registers | 5 |
| used ymm registers | 2 |
| used zmm registers | 4 |
| nb stack references | 12 |
| ADD-SUB / MUL ratio | 8.00 |
| micro-operation queue | 23.75 cycles |
| front end | 23.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 15.00 | 15.00 | 10.00 | 10.00 | 1.00 | 15.00 | 15.00 | 1.00 |
| cycles | 15.00 | 15.00 | 10.00 | 10.00 | 1.00 | 15.00 | 15.00 | 1.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 23.75 |
| Dispatch | 15.00 |
| Overall L1 | 23.75 |
| all | 9% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 20% |
| all | 72% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | 0% |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 87% |
| all | 32% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 50% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 43% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 13% |
| all | 38% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | 6% |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| all | 22% |
| load | 12% |
| store | 6% |
| mul | 11% |
| add-sub | 43% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 17% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM1,%XMM1,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (12.5%) |
| VMULSS %XMM0,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| MOV 0x10(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %EAX,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x50(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM1,(%R15,%RCX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP 0x20(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV 0x48(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 43f0 <main+0x1e00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x28(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| CMP $0x4,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JB 448d <main+0x1e9d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| ADD 0x80(%RSP),%ECX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| SETB %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| OR 0x40(%RSP),%CL | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 45f0 <main+0x2000> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x30(%RSP),%R9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| TEST %R9,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 44f0 <main+0x1f00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x18(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV %R13,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RCX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x28(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RSI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP $-0x4,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JA 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 450a <main+0x1f1a> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV %RCX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x28(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RSI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP $-0x4,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JA 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA 0x3(%R11),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV 0x18(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R15,%R13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| LEA 0x2(%R11),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| IMUL %R15,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD %RAX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %R15,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| IMUL %R11,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| LEA 0x1(%R11),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| IMUL %R15,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4440 <main+0x1e50> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x20,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JAE 4601 <main+0x2011> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JMP 46d6 <main+0x20e6> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x120(%RSP),%R9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM3,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM2,%ZMM1,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| CMP %R9D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JE 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %R9,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1c,%R12B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 448f <main+0x1e9f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVQ %XMM1,%XMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| MOV 0xe0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP %ESI,0x28(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 448f <main+0x1e9f> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 94 |
| nb uops | 95 |
| loop length | 408 |
| used x86 registers | 11 |
| used mmx registers | 0 |
| used xmm registers | 5 |
| used ymm registers | 2 |
| used zmm registers | 4 |
| nb stack references | 12 |
| ADD-SUB / MUL ratio | 8.00 |
| micro-operation queue | 23.75 cycles |
| front end | 23.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 15.00 | 15.00 | 10.00 | 10.00 | 1.00 | 15.00 | 15.00 | 1.00 |
| cycles | 15.00 | 15.00 | 10.00 | 10.00 | 1.00 | 15.00 | 15.00 | 1.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 23.75 |
| Dispatch | 15.00 |
| Overall L1 | 23.75 |
| all | 9% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 20% |
| all | 72% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | 0% |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 87% |
| all | 32% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 50% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 43% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 13% |
| all | 38% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | 6% |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| all | 22% |
| load | 12% |
| store | 6% |
| mul | 11% |
| add-sub | 43% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 17% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM1,%XMM1,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (12.5%) |
| VMULSS %XMM0,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| MOV 0x10(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %EAX,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x50(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM1,(%R15,%RCX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP 0x20(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV 0x48(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 43f0 <main+0x1e00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x28(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| CMP $0x4,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JB 448d <main+0x1e9d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| ADD 0x80(%RSP),%ECX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| SETB %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| OR 0x40(%RSP),%CL | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 45f0 <main+0x2000> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x30(%RSP),%R9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| TEST %R9,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 44f0 <main+0x1f00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x18(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV %R13,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RCX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x28(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RSI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP $-0x4,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JA 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 450a <main+0x1f1a> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV %RCX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x28(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RSI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP $-0x4,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JA 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA 0x3(%R11),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV 0x18(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R15,%R13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| LEA 0x2(%R11),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| IMUL %R15,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD %RAX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %R15,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| IMUL %R11,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| LEA 0x1(%R11),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| IMUL %R15,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4440 <main+0x1e50> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x20,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JAE 4601 <main+0x2011> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JMP 46d6 <main+0x20e6> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x120(%RSP),%R9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM3,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM2,%ZMM1,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| CMP %R9D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JE 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %R9,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1c,%R12B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 448f <main+0x1e9f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVQ %XMM1,%XMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| MOV 0xe0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP %ESI,0x28(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 4440 <main+0x1e50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 448f <main+0x1e9f> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
