| Loop Id: 49 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 9.49% |
|---|
| Loop Id: 49 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 9.49% |
|---|
0x5360 VCVTSD2SS %XMM1,%XMM1,%XMM1 |
0x5364 VDIVSS %XMM0,%XMM1,%XMM1 |
0x5368 MOV 0x38(%RSP),%RAX |
0x536d ADD %ECX,%EAX |
0x536f MOV 0xd8(%RSP),%RBX |
0x5377 VMOVSS %XMM1,(%RBX,%RAX,4) |
0x537c INC %RCX |
0x537f MOV 0x70(%RSP),%R15 |
0x5384 CMP %R15,%RCX |
0x5387 JE 5300 |
0x538d VPXOR %XMM1,%XMM1,%XMM1 |
0x5391 MOV 0x68(%RSP),%R11 |
0x5396 CMP $0x4,%R11D |
0x539a JB 53b5 |
0x539c MOV %ECX,%EAX |
0x539e ADD 0x200(%RSP),%EAX |
0x53a5 SETB %AL |
0x53a8 OR 0x180(%RSP),%AL |
0x53af JE 54a0 |
0x53b5 XOR %EBX,%EBX |
0x53b7 MOV %RBX,%RAX |
0x53ba OR $0x1,%RAX |
0x53be TEST $0x1,%R13B |
0x53c2 JE 53fd |
0x53c4 MOV 0x100(%RSP),%R8 |
0x53cc LEA (%R8,%RBX,1),%R10D |
0x53d0 MOV 0x78(%RSP),%R8 |
0x53d5 VMOVSS (%R8,%R10,4),%XMM2 |
0x53db VCVTSS2SD %XMM2,%XMM2,%XMM2 |
0x53df MOV 0x60(%RSP),%R10 |
0x53e4 IMUL %EBX,%R10D |
0x53e8 ADD %ECX,%R10D |
0x53eb VMOVSS (%R12,%R10,4),%XMM3 |
0x53f1 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
0x53f5 VFMADD231SD %XMM3,%XMM2,%XMM1 |
0x53fa MOV %RAX,%RBX |
0x53fd CMP %EAX,0x68(%RSP) |
0x5401 MOV 0x78(%RSP),%R15 |
0x5406 JE 5360 |
0x540c MOV 0x68(%RSP),%RAX |
0x5411 SUB %RBX,%RAX |
0x5414 MOV 0x60(%RSP),%R8 |
0x5419 MOV %R8,%R10 |
0x541c IMUL %RBX,%R10 |
0x5420 ADD %RCX,%R10 |
0x5423 LEA 0x1(%RBX),%R11 |
0x5427 IMUL %R8,%R11 |
0x542b ADD %RCX,%R11 |
0x542e ADD %R9D,%EBX |
0x5431 NOPW %CS:(%RAX,%RAX,1) |
(50) 0x5440 MOV %EBX,%R8D |
(50) 0x5443 VMOVSS (%R15,%R8,4),%XMM2 |
(50) 0x5449 VCVTSS2SD %XMM2,%XMM2,%XMM2 |
(50) 0x544d MOV %R10D,%R8D |
(50) 0x5450 VMOVSS (%R12,%R8,4),%XMM3 |
(50) 0x5456 VCVTSS2SD %XMM3,%XMM3,%XMM3 |
(50) 0x545a VFMADD213SD %XMM1,%XMM2,%XMM3 |
(50) 0x545f LEA 0x1(%RBX),%R8D |
(50) 0x5463 VMOVSS (%R15,%R8,4),%XMM1 |
(50) 0x5469 VCVTSS2SD %XMM1,%XMM1,%XMM2 |
(50) 0x546d MOV %R11D,%R8D |
(50) 0x5470 VMOVSS (%R12,%R8,4),%XMM1 |
(50) 0x5476 VCVTSS2SD %XMM1,%XMM1,%XMM1 |
(50) 0x547a VFMADD213SD %XMM3,%XMM2,%XMM1 |
(50) 0x547f ADD %RDI,%R10 |
(50) 0x5482 ADD %RDI,%R11 |
(50) 0x5485 ADD $0x2,%EBX |
(50) 0x5488 ADD $-0x2,%RAX |
(50) 0x548c JNE 5440 |
0x548e JMP 5360 |
0x54a0 CMP $0x20,%R11D |
0x54a4 JAE 54b2 |
0x54a6 XOR %EAX,%EAX |
0x54a8 MOV 0x78(%RSP),%R8 |
0x54ad JMP 5587 |
0x54b2 VPXOR %XMM1,%XMM1,%XMM1 |
0x54b6 XOR %EAX,%EAX |
0x54b8 VPXOR %XMM2,%XMM2,%XMM2 |
0x54bc VPXOR %XMM3,%XMM3,%XMM3 |
0x54c0 VPXOR %XMM4,%XMM4,%XMM4 |
0x54c4 MOV 0x78(%RSP),%R8 |
0x54c9 NOPL (%RAX) |
(51) 0x54d0 LEA (%R9,%RAX,1),%R10D |
(51) 0x54d4 VCVTPS2PD (%R8,%R10,4),%ZMM5 |
(51) 0x54db VCVTPS2PD 0x20(%R8,%R10,4),%ZMM6 |
(51) 0x54e3 VCVTPS2PD 0x40(%R8,%R10,4),%ZMM7 |
(51) 0x54eb VCVTPS2PD 0x60(%R8,%R10,4),%ZMM8 |
(51) 0x54f3 LEA (%RCX,%RAX,1),%R10D |
(51) 0x54f7 VCVTPS2PD (%R12,%R10,4),%ZMM9 |
(51) 0x54fe VCVTPS2PD 0x20(%R12,%R10,4),%ZMM10 |
(51) 0x5506 VFMADD231PD %ZMM9,%ZMM5,%ZMM1 |
(51) 0x550c VCVTPS2PD 0x40(%R12,%R10,4),%ZMM5 |
(51) 0x5514 VFMADD231PD %ZMM10,%ZMM6,%ZMM2 |
(51) 0x551a VCVTPS2PD 0x60(%R12,%R10,4),%ZMM6 |
(51) 0x5522 VFMADD231PD %ZMM5,%ZMM7,%ZMM3 |
(51) 0x5528 VFMADD231PD %ZMM6,%ZMM8,%ZMM4 |
(51) 0x552e ADD $0x20,%RAX |
(51) 0x5532 CMP %RAX,%RDX |
(51) 0x5535 JNE 54d0 |
0x5537 VADDPD %ZMM1,%ZMM2,%ZMM1 |
0x553d VADDPD %ZMM3,%ZMM4,%ZMM2 |
0x5543 VADDPD %ZMM1,%ZMM2,%ZMM1 |
0x5549 VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 |
0x5550 VADDPD %ZMM2,%ZMM1,%ZMM1 |
0x5556 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
0x555c VADDPD %XMM2,%XMM1,%XMM1 |
0x5560 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
0x5565 VADDSD %XMM2,%XMM1,%XMM1 |
0x5569 CMP %EDX,%R11D |
0x556c JE 5360 |
0x5572 MOV %RDX,%RAX |
0x5575 MOV %RDX,%RBX |
0x5578 TEST $0x1c,%R13B |
0x557c MOV 0x78(%RSP),%R8 |
0x5581 JE 53b7 |
0x5587 VMOVQ %XMM1,%XMM1 |
0x558b NOPL (%RAX,%RAX,1) |
(52) 0x5590 LEA (%R9,%RAX,1),%R10D |
(52) 0x5594 VCVTPS2PD (%R8,%R10,4),%YMM2 |
(52) 0x559a LEA (%RCX,%RAX,1),%R10D |
(52) 0x559e VCVTPS2PD (%R12,%R10,4),%YMM3 |
(52) 0x55a4 VFMADD231PD %YMM3,%YMM2,%YMM1 |
(52) 0x55a9 ADD $0x4,%RAX |
(52) 0x55ad CMP %RAX,%RSI |
(52) 0x55b0 JNE 5590 |
0x55b2 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
0x55b8 VADDPD %XMM2,%XMM1,%XMM1 |
0x55bc VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
0x55c1 VADDSD %XMM2,%XMM1,%XMM1 |
0x55c5 MOV %RSI,%RBX |
0x55c8 CMP %ESI,0x68(%RSP) |
0x55cc JE 5360 |
0x55d2 JMP 53b7 |
/home/eoseret/Applications/llm-attention/attention.cpp: 27 - 33 |
-------------------------------------------------------------------------------- |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-avx512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.36 |
| CQA speedup if FP arith vectorized | 1.06 |
| CQA speedup if fully vectorized | 1.37 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.59 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 23.00 |
| CQA cycles if no scalar integer | 9.75 |
| CQA cycles if FP arith vectorized | 21.69 |
| CQA cycles if fully vectorized | 16.80 |
| Front-end cycles | 23.00 |
| P0 cycles | 14.50 |
| P1 cycles | 14.50 |
| P2 cycles | 9.50 |
| P3 cycles | 9.50 |
| P4 cycles | 1.00 |
| P5 cycles | 14.50 |
| P6 cycles | 14.50 |
| P7 cycles | 1.00 |
| DIV/SQRT cycles | 3.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 89.00 |
| Nb uops | 92.00 |
| Nb loads | 19.00 |
| Nb stores | 1.00 |
| Nb stack references | 9.00 |
| FLOP/cycle | 1.78 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 1.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 5.61 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 125.00 |
| Bytes stored | 4.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 42.11 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 60.00 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 52.63 |
| Vector-efficiency ratio all | 24.67 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | 12.50 |
| Vector-efficiency ratio add_sub | 50.00 |
| Vector-efficiency ratio fma | 12.50 |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 18.42 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.36 |
| CQA speedup if FP arith vectorized | 1.06 |
| CQA speedup if fully vectorized | 1.37 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.59 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 23.00 |
| CQA cycles if no scalar integer | 9.75 |
| CQA cycles if FP arith vectorized | 21.69 |
| CQA cycles if fully vectorized | 16.80 |
| Front-end cycles | 23.00 |
| P0 cycles | 14.50 |
| P1 cycles | 14.50 |
| P2 cycles | 9.50 |
| P3 cycles | 9.50 |
| P4 cycles | 1.00 |
| P5 cycles | 14.50 |
| P6 cycles | 14.50 |
| P7 cycles | 1.00 |
| DIV/SQRT cycles | 3.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 89.00 |
| Nb uops | 92.00 |
| Nb loads | 19.00 |
| Nb stores | 1.00 |
| Nb stack references | 9.00 |
| FLOP/cycle | 1.78 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 1.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 5.61 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 125.00 |
| Bytes stored | 4.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 42.11 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 60.00 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 52.63 |
| Vector-efficiency ratio all | 24.67 |
| Vector-efficiency ratio load | 9.38 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | 12.50 |
| Vector-efficiency ratio add_sub | 50.00 |
| Vector-efficiency ratio fma | 12.50 |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 18.42 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 89 |
| nb uops | 92 |
| loop length | 403 |
| used x86 registers | 13 |
| used mmx registers | 0 |
| used xmm registers | 5 |
| used ymm registers | 2 |
| used zmm registers | 4 |
| nb stack references | 9 |
| micro-operation queue | 23.00 cycles |
| front end | 23.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 14.50 | 14.50 | 9.50 | 9.50 | 1.00 | 14.50 | 14.50 | 1.00 |
| cycles | 14.50 | 14.50 | 9.50 | 9.50 | 1.00 | 14.50 | 14.50 | 1.00 |
| Cycles executing div or sqrt instructions | 3.00 |
| Front-end | 23.00 |
| Dispatch | 14.50 |
| DIV/SQRT | 3.00 |
| Overall L1 | 23.00 |
| Vectorization ratio (INT) | Value |
|---|---|
| all | 29% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 45% |
| Vectorization ratio (FP) | Value |
|---|---|
| all | 52% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | 0% |
| div/sqrt | 0% |
| other | 62% |
| Vectorization ratio (INT+FP) | Value |
|---|---|
| all | 42% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 60% |
| fma | 0% |
| div/sqrt | 0% |
| other | 52% |
| Vector-efficiency ratio (INT) | Value |
|---|---|
| all | 14% |
| load | 10% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 15% |
| Vector-efficiency ratio (FP) | Value |
|---|---|
| all | 32% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | 12% |
| div/sqrt | 6% |
| other | 21% |
| Vector-efficiency ratio (INT+FP) | Value |
|---|---|
| all | 24% |
| load | 9% |
| store | 6% |
| mul | 12% |
| add-sub | 50% |
| fma | 12% |
| div/sqrt | 6% |
| other | 18% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM1,%XMM1,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (12.5%) |
| VDIVSS %XMM0,%XMM1,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| MOV 0x38(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %ECX,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xd8(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM1,(%RBX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x70(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %R15,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 5300 <main+0x1b50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPXOR %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x68(%RSP),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP $0x4,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JB 53b5 <main+0x1c05> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %ECX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| ADD 0x200(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| SETB %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| OR 0x180(%RSP),%AL | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 54a0 <main+0x1cf0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RBX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| OR $0x1,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| TEST $0x1,%R13B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JE 53fd <main+0x1c4d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (%R8,%RBX,1),%R10D | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS (%R8,%R10,4),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VCVTSS2SD %XMM2,%XMM2,%XMM2 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (6.3%) |
| MOV 0x60(%RSP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %EBX,%R10D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD %ECX,%R10D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VMOVSS (%R12,%R10,4),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VCVTSS2SD %XMM3,%XMM3,%XMM3 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (6.3%) |
| VFMADD231SD %XMM3,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RAX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP %EAX,0x68(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x78(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 5360 <main+0x1bb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x68(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RBX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x60(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %R8,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| IMUL %RBX,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD %RCX,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| LEA 0x1(%RBX),%R11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| IMUL %R8,%R11 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RCX,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD %R9D,%EBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 5360 <main+0x1bb0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x20,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JAE 54b2 <main+0x1d02> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JMP 5587 <main+0x1dd7> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VPXOR %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM3,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM2,%ZMM1,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| CMP %EDX,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JE 5360 <main+0x1bb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RDX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RDX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1c,%R13B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 53b7 <main+0x1c07> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVQ %XMM1,%XMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP %ESI,0x68(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 5360 <main+0x1bb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 53b7 <main+0x1c07> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 89 |
| nb uops | 92 |
| loop length | 403 |
| used x86 registers | 13 |
| used mmx registers | 0 |
| used xmm registers | 5 |
| used ymm registers | 2 |
| used zmm registers | 4 |
| nb stack references | 9 |
| micro-operation queue | 23.00 cycles |
| front end | 23.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 14.50 | 14.50 | 9.50 | 9.50 | 1.00 | 14.50 | 14.50 | 1.00 |
| cycles | 14.50 | 14.50 | 9.50 | 9.50 | 1.00 | 14.50 | 14.50 | 1.00 |
| Cycles executing div or sqrt instructions | 3.00 |
| Front-end | 23.00 |
| Dispatch | 14.50 |
| DIV/SQRT | 3.00 |
| Overall L1 | 23.00 |
| Vectorization ratio (INT) | Value |
|---|---|
| all | 29% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 45% |
| Vectorization ratio (FP) | Value |
|---|---|
| all | 52% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | 0% |
| div/sqrt | 0% |
| other | 62% |
| Vectorization ratio (INT+FP) | Value |
|---|---|
| all | 42% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 60% |
| fma | 0% |
| div/sqrt | 0% |
| other | 52% |
| Vector-efficiency ratio (INT) | Value |
|---|---|
| all | 14% |
| load | 10% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 15% |
| Vector-efficiency ratio (FP) | Value |
|---|---|
| all | 32% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | 12% |
| div/sqrt | 6% |
| other | 21% |
| Vector-efficiency ratio (INT+FP) | Value |
|---|---|
| all | 24% |
| load | 9% |
| store | 6% |
| mul | 12% |
| add-sub | 50% |
| fma | 12% |
| div/sqrt | 6% |
| other | 18% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM1,%XMM1,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (12.5%) |
| VDIVSS %XMM0,%XMM1,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| MOV 0x38(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %ECX,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xd8(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM1,(%RBX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x70(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %R15,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 5300 <main+0x1b50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPXOR %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x68(%RSP),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP $0x4,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JB 53b5 <main+0x1c05> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %ECX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| ADD 0x200(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| SETB %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| OR 0x180(%RSP),%AL | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 54a0 <main+0x1cf0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RBX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| OR $0x1,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| TEST $0x1,%R13B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JE 53fd <main+0x1c4d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (%R8,%RBX,1),%R10D | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS (%R8,%R10,4),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VCVTSS2SD %XMM2,%XMM2,%XMM2 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (6.3%) |
| MOV 0x60(%RSP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %EBX,%R10D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD %ECX,%R10D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VMOVSS (%R12,%R10,4),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VCVTSS2SD %XMM3,%XMM3,%XMM3 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 | scal (6.3%) |
| VFMADD231SD %XMM3,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RAX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP %EAX,0x68(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x78(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 5360 <main+0x1bb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x68(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RBX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x60(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %R8,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| IMUL %RBX,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD %RCX,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| LEA 0x1(%RBX),%R11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| IMUL %R8,%R11 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| ADD %RCX,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD %R9D,%EBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 5360 <main+0x1bb0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x20,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JAE 54b2 <main+0x1d02> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JMP 5587 <main+0x1dd7> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VPXOR %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPXOR %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM3,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VADDPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM1,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPD %ZMM2,%ZMM1,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| CMP %EDX,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JE 5360 <main+0x1bb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RDX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RDX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1c,%R13B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x78(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 53b7 <main+0x1c07> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVQ %XMM1,%XMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP %ESI,0x68(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JE 5360 <main+0x1bb0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 53b7 <main+0x1c07> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
