| Loop Id: 48 | Module: attention-avx512 | Source: attention.cpp:26-262 [...] | Coverage: 0.03% |
|---|
| Loop Id: 48 | Module: attention-avx512 | Source: attention.cpp:26-262 [...] | Coverage: 0.03% |
|---|
0x43f0 MOV 0x200(%RSP),%EDX |
0x43f7 INC %EDX |
0x43f9 ADD %R12,%R10 |
0x43fc MOV 0x18(%RSP),%RCX |
0x4401 CMP %ECX,%EDX |
0x4403 JE 43c0 |
0x4405 MOV %EDX,%EAX |
0x4407 IMUL %R12D,%EAX |
0x440b MOV %EDX,0x200(%RSP) |
0x4412 IMUL %ECX,%EDX |
0x4415 MOV %RDX,0x10(%RSP) |
0x441a ADD 0x80(%RSP),%EAX |
0x4421 SETB %AL |
0x4424 OR 0x1e0(%RSP),%AL |
0x442b MOV %AL,0x40(%RSP) |
0x442f XOR %EAX,%EAX |
0x4431 JMP 4469 |
(49) 0x4440 VCVTSD2SS %XMM1,%XMM1,%XMM1 |
(49) 0x4444 VMULSS %XMM0,%XMM1,%XMM1 |
(49) 0x4448 MOV 0x10(%RSP),%RCX |
(49) 0x444d ADD %EAX,%ECX |
(49) 0x444f MOV 0x50(%RSP),%R15 |
(49) 0x4454 VMOVSS %XMM1,(%R15,%RCX,4) |
(49) 0x445a INC %RAX |
(49) 0x445d CMP 0x20(%RSP),%RAX |
(49) 0x4462 MOV 0x48(%RSP),%R13 |
(49) 0x4467 JE 43f0 |
(49) 0x4469 VXORPS %XMM1,%XMM1,%XMM1 |
(49) 0x446d MOV 0x28(%RSP),%RSI |
(49) 0x4472 CMP $0x4,%ESI |
(49) 0x4475 JB 448d |
(49) 0x4477 MOV %EAX,%ECX |
(49) 0x4479 ADD 0x80(%RSP),%ECX |
(49) 0x4480 SETB %CL |
(49) 0x4483 OR 0x40(%RSP),%CL |
(49) 0x4487 JE 45f0 |
(49) 0x448d XOR %ECX,%ECX |
(49) 0x448f MOV 0x30(%RSP),%R9 |
(49) 0x4494 TEST %R9,%R9 |
(49) 0x4497 JE 44f0 |
(49) 0x4499 MOV 0x18(%RSP),%R13 |
(49) 0x449e MOV %R13,%RDI |
(49) 0x44a1 IMUL %RCX,%RDI |
(49) 0x44a5 ADD %RAX,%RDI |
(49) 0x44a8 MOV %RCX,%R11 |
(49) 0x44ab MOV 0x58(%RSP),%RDX |
(51) 0x44b0 LEA (%R10,%R11,1),%R15D |
(51) 0x44b4 VMOVSS (%RDX,%R15,4),%XMM2 |
(51) 0x44ba VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(51) 0x44be MOV %EDI,%R15D |
(51) 0x44c1 VMOVSS (%RBX,%R15,4),%XMM3 |
(51) 0x44c7 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(51) 0x44cb VFMADD231SD %XMM3,%XMM2,%XMM1 |
(51) 0x44d0 INC %R11 |
(51) 0x44d3 ADD %R13,%RDI |
(51) 0x44d6 DEC %R9 |
(51) 0x44d9 JNE 44b0 |
(49) 0x44db MOV 0x28(%RSP),%RSI |
(49) 0x44e0 SUB %RSI,%RCX |
(49) 0x44e3 CMP $-0x4,%RCX |
(49) 0x44e7 JA 4440 |
(49) 0x44ed JMP 450a |
(49) 0x44f0 MOV %RCX,%R11 |
(49) 0x44f3 MOV 0x58(%RSP),%RDX |
(49) 0x44f8 MOV 0x28(%RSP),%RSI |
(49) 0x44fd SUB %RSI,%RCX |
(49) 0x4500 CMP $-0x4,%RCX |
(49) 0x4504 JA 4440 |
(49) 0x450a LEA 0x3(%R11),%R13 |
(49) 0x450e MOV 0x18(%RSP),%R15 |
(49) 0x4513 IMUL %R15,%R13 |
(49) 0x4517 ADD %RAX,%R13 |
(49) 0x451a LEA 0x2(%R11),%RCX |
(49) 0x451e IMUL %R15,%RCX |
(49) 0x4522 ADD %RAX,%RCX |
(49) 0x4525 MOV %R15,%R9 |
(49) 0x4528 IMUL %R11,%R9 |
(49) 0x452c ADD %RAX,%R9 |
(49) 0x452f LEA 0x1(%R11),%RDI |
(49) 0x4533 IMUL %R15,%RDI |
(49) 0x4537 ADD %RAX,%RDI |
(49) 0x453a NOPW (%RAX,%RAX,1) |
(50) 0x4540 LEA (%R10,%R11,1),%R15D |
(50) 0x4544 VMOVSS (%RDX,%R15,4),%XMM2 |
(50) 0x454a VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(50) 0x454e MOV %R9D,%R15D |
(50) 0x4551 VMOVSS (%RBX,%R15,4),%XMM3 |
(50) 0x4557 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(50) 0x455b LEA 0x1(%R10,%R11,1),%R15D |
(50) 0x4560 VMOVSS (%RDX,%R15,4),%XMM4 |
(50) 0x4566 VCVTSS2SD %XMM4,%XMM4,%XMM4 |
(50) 0x456a MOV %EDI,%R15D |
(50) 0x456d VMOVSS (%RBX,%R15,4),%XMM5 |
(50) 0x4573 VCVTSS2SD %XMM5,%XMM5,%XMM5 |
(50) 0x4577 VFMADD213SD %XMM1,%XMM2,%XMM3 |
(50) 0x457c LEA 0x2(%R10,%R11,1),%R15D |
(50) 0x4581 VMOVSS (%RDX,%R15,4),%XMM1 |
(50) 0x4587 VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(50) 0x458b VFMADD213SD %XMM3,%XMM4,%XMM5 |
(50) 0x4590 MOV %ECX,%R15D |
(50) 0x4593 VMOVSS (%RBX,%R15,4),%XMM2 |
(50) 0x4599 VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(50) 0x459d VFMADD213SD %XMM5,%XMM1,%XMM2 |
(50) 0x45a2 LEA 0x3(%R10,%R11,1),%R15D |
(50) 0x45a7 VMOVSS (%RDX,%R15,4),%XMM1 |
(50) 0x45ad VCVTSS2SD %XMM1,%XMM1,%XMM3 |
(50) 0x45b1 MOV %R13D,%R15D |
(50) 0x45b4 VMOVSS (%RBX,%R15,4),%XMM1 |
(50) 0x45ba VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(50) 0x45be VFMADD213SD %XMM2,%XMM3,%XMM1 |
(50) 0x45c3 ADD $0x4,%R11 |
(50) 0x45c7 ADD %R8,%R13 |
(50) 0x45ca ADD %R8,%RCX |
(50) 0x45cd ADD %R8,%R9 |
(50) 0x45d0 ADD %R8,%RDI |
(50) 0x45d3 CMP %R11,%RSI |
(50) 0x45d6 JNE 4540 |
(49) 0x45dc JMP 4440 |
(49) 0x45f0 CMP $0x20,%ESI |
(49) 0x45f3 JAE 4601 |
(49) 0x45f5 XOR %EDI,%EDI |
(49) 0x45f7 MOV 0x58(%RSP),%RDX |
(49) 0x45fc JMP 46d6 |
(49) 0x4601 VXORPS %XMM1,%XMM1,%XMM1 |
(49) 0x4605 XOR %ECX,%ECX |
(49) 0x4607 VPXOR %XMM2,%XMM2,%XMM2 |
(49) 0x460b VPXOR %XMM3,%XMM3,%XMM3 |
(49) 0x460f VPXOR %XMM4,%XMM4,%XMM4 |
(49) 0x4613 MOV 0x58(%RSP),%RDX |
(49) 0x4618 MOV 0x120(%RSP),%R9 |
(52) 0x4620 LEA (%R10,%RCX,1),%EDI |
(52) 0x4624 VCVTPS2PD (%RDX,%RDI,4),%ZMM5 |
(52) 0x462b VCVTPS2PD 0x20(%RDX,%RDI,4),%ZMM6 |
(52) 0x4633 VCVTPS2PD 0x40(%RDX,%RDI,4),%ZMM7 |
(52) 0x463b VCVTPS2PD 0x60(%RDX,%RDI,4),%ZMM8 |
(52) 0x4643 LEA (%RAX,%RCX,1),%EDI |
(52) 0x4646 VCVTPS2PD (%RBX,%RDI,4),%ZMM9 |
(52) 0x464d VFMADD231PD %ZMM9,%ZMM5,%ZMM1 |
(52) 0x4653 VCVTPS2PD 0x20(%RBX,%RDI,4),%ZMM5 |
(52) 0x465b VFMADD231PD %ZMM5,%ZMM6,%ZMM2 |
(52) 0x4661 VCVTPS2PD 0x40(%RBX,%RDI,4),%ZMM5 |
(52) 0x4669 VFMADD231PD %ZMM5,%ZMM7,%ZMM3 |
(52) 0x466f VCVTPS2PD 0x60(%RBX,%RDI,4),%ZMM5 |
(52) 0x4677 VFMADD231PD %ZMM5,%ZMM8,%ZMM4 |
(52) 0x467d ADD $0x20,%RCX |
(52) 0x4681 CMP %RCX,%R9 |
(52) 0x4684 JNE 4620 |
(49) 0x4686 VADDPD %ZMM1,%ZMM2,%ZMM1 |
(49) 0x468c VADDPD %ZMM3,%ZMM4,%ZMM2 |
(49) 0x4692 VADDPD %ZMM1,%ZMM2,%ZMM1 |
(49) 0x4698 VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 |
(49) 0x469f VADDPD %ZMM2,%ZMM1,%ZMM1 |
(49) 0x46a5 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(49) 0x46ab VADDPD %XMM2,%XMM1,%XMM1 |
(49) 0x46af VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(49) 0x46b4 VADDSD %XMM2,%XMM1,%XMM1 |
(49) 0x46b8 CMP %R9D,%ESI |
(49) 0x46bb JE 4440 |
(49) 0x46c1 MOV %R9,%RDI |
(49) 0x46c4 MOV %R9,%RCX |
(49) 0x46c7 TEST $0x1c,%R12B |
(49) 0x46cb MOV 0x58(%RSP),%RDX |
(49) 0x46d0 JE 448f |
(49) 0x46d6 VMOVQ %XMM1,%XMM1 |
(49) 0x46da MOV 0xe0(%RSP),%RSI |
(49) 0x46e2 NOPW %CS:(%RAX,%RAX,1) |
(53) 0x46f0 LEA (%R10,%RDI,1),%ECX |
(53) 0x46f4 VCVTPS2PD (%RDX,%RCX,4),%YMM2 |
(53) 0x46f9 LEA (%RAX,%RDI,1),%ECX |
(53) 0x46fc VCVTPS2PD (%RBX,%RCX,4),%YMM3 |
(53) 0x4701 VFMADD231PD %YMM3,%YMM2,%YMM1 |
(53) 0x4706 ADD $0x4,%RDI |
(53) 0x470a CMP %RDI,%RSI |
(53) 0x470d JNE 46f0 |
(49) 0x470f VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(49) 0x4715 VADDPD %XMM2,%XMM1,%XMM1 |
(49) 0x4719 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(49) 0x471e VADDSD %XMM2,%XMM1,%XMM1 |
(49) 0x4722 MOV %RSI,%RCX |
(49) 0x4725 CMP %ESI,0x28(%RSP) |
(49) 0x4729 JE 4440 |
(49) 0x472f JMP 448f |
/home/eoseret/Applications/llm-attention/attention.cpp: 26 - 262 |
-------------------------------------------------------------------------------- |
26: for (unsigned int i = 0; i < M; ++i) { |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
[...] |
262: for (size_t r = 0; r < rept; r++) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | | libc.so.6 |
| ○ | __libc_start_main | | libc.so.6 |
| ○ | _start | | attention-avx512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 13.60 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.31 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention.cpp:26-26,attention.cpp:262-262 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 4.25 |
| CQA cycles if no scalar integer | 4.25 |
| CQA cycles if FP arith vectorized | 4.25 |
| CQA cycles if fully vectorized | 0.31 |
| Front-end cycles | 4.25 |
| P0 cycles | 2.50 |
| P1 cycles | 3.25 |
| P2 cycles | 2.33 |
| P3 cycles | 2.33 |
| P4 cycles | 3.00 |
| P5 cycles | 2.50 |
| P6 cycles | 2.50 |
| P7 cycles | 2.33 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 17.00 |
| Nb uops | 17.00 |
| Nb loads | 4.00 |
| Nb stores | 3.00 |
| Nb stack references | 6.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.06 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 17.00 |
| Bytes stored | 13.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.59 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 9.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 6.25 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 13.60 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.31 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention.cpp:26-26,attention.cpp:262-262 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 4.25 |
| CQA cycles if no scalar integer | 4.25 |
| CQA cycles if FP arith vectorized | 4.25 |
| CQA cycles if fully vectorized | 0.31 |
| Front-end cycles | 4.25 |
| P0 cycles | 2.50 |
| P1 cycles | 3.25 |
| P2 cycles | 2.33 |
| P3 cycles | 2.33 |
| P4 cycles | 3.00 |
| P5 cycles | 2.50 |
| P6 cycles | 2.50 |
| P7 cycles | 2.33 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 17.00 |
| Nb uops | 17.00 |
| Nb loads | 4.00 |
| Nb stores | 3.00 |
| Nb stack references | 6.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.06 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 17.00 |
| Bytes stored | 13.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.59 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 9.38 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 6.25 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:26-262 |
| Module | attention-avx512 |
| nb instructions | 17 |
| nb uops | 17 |
| loop length | 67 |
| used x86 registers | 6 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 4.25 cycles |
| front end | 4.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 2.33 | 2.33 | 3.00 | 2.50 | 2.50 | 2.33 |
| cycles | 2.50 | 3.25 | 2.33 | 2.33 | 3.00 | 2.50 | 2.50 | 2.33 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 4.25 |
| Dispatch | 3.25 |
| Overall L1 | 4.25 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 8% |
| load | 9% |
| store | 9% |
| mul | 6% |
| add-sub | 9% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x200(%RSP),%EDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| INC %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| ADD %R12,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x18(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %ECX,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JE 43c0 <main+0x1dd0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| IMUL %R12D,%EAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %EDX,0x200(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| IMUL %ECX,%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV %RDX,0x10(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| ADD 0x80(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| SETB %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| OR 0x1e0(%RSP),%AL | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %AL,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4469 <main+0x1e79> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | attention.cpp:26-262 |
| Module | attention-avx512 |
| nb instructions | 17 |
| nb uops | 17 |
| loop length | 67 |
| used x86 registers | 6 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 4.25 cycles |
| front end | 4.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 2.33 | 2.33 | 3.00 | 2.50 | 2.50 | 2.33 |
| cycles | 2.50 | 3.25 | 2.33 | 2.33 | 3.00 | 2.50 | 2.50 | 2.33 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 4.25 |
| Dispatch | 3.25 |
| Overall L1 | 4.25 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 8% |
| load | 9% |
| store | 9% |
| mul | 6% |
| add-sub | 9% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x200(%RSP),%EDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| INC %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| ADD %R12,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x18(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %ECX,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JE 43c0 <main+0x1dd0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| IMUL %R12D,%EAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %EDX,0x200(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| IMUL %ECX,%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV %RDX,0x10(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| ADD 0x80(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| SETB %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| OR 0x1e0(%RSP),%AL | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %AL,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4469 <main+0x1e79> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
