| Loop Id: 27 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 0.90% |
|---|
| Loop Id: 27 | Module: attention-avx512 | Source: attention.cpp:27-33 | Coverage: 0.90% |
|---|
0x407140 XOR %R12D,%R12D |
0x407143 VMOVAPD %ZMM9,%ZMM10 |
0x407149 MOV 0x70(%RSP),%RDX |
0x40714e ADD %R12D,%EDX |
0x407151 VMOVSS (%R13,%RDX,4),%XMM9 |
0x407158 IMUL 0x18(%RSP),%R12D |
0x40715e LEA (%RBX,%R12,1),%EDX |
0x407162 LEA 0x5(%RBX,%R12,1),%ESI |
0x407167 VPBROADCASTD %EDX,%XMM11 |
0x40716d VPADDD %YMM3,%YMM11,%YMM12 |
0x407171 VPERMT2D %YMM11,%YMM4,%YMM12 |
0x407177 VPBROADCASTD %ESI,%YMM13 |
0x40717d VPBLENDD $0x20,%YMM13,%YMM12,%YMM12 |
0x407183 VPADDD %YMM5,%YMM11,%YMM11 |
0x407187 VPERMT2Q %YMM11,%YMM6,%YMM12 |
0x40718d VPSUBD %YMM2,%YMM12,%YMM11 |
0x407191 VPXOR %XMM12,%XMM12,%XMM12 |
0x407196 KXNORB %K0,%K0,%K1 |
0x40719a VGATHERDPS (%RAX,%YMM11,4),%YMM12{%K1} |
0x4071a1 VCVTSS2SD %XMM9,%XMM9,%XMM9 |
0x4071a6 VCVTPS2PD %YMM12,%ZMM11 |
0x4071ac VBROADCASTSD %XMM9,%ZMM9 |
0x4071b2 VFMADD213PD %ZMM10,%ZMM11,%ZMM9 |
0x4071b8 VCVTSD2SS %XMM9,%XMM9,%XMM11 |
0x4071bd MOV 0x50(%RSP),%R11 |
0x4071c2 LEA (%R11,%RBX,1),%EDX |
0x4071c6 MOV 0xe0(%RSP),%RSI |
0x4071ce VMOVSS %XMM11,(%RSI,%RDX,4) |
0x4071d3 VSHUFPD $0x1,%XMM9,%XMM9,%XMM10 |
0x4071d9 VCVTSD2SS %XMM10,%XMM10,%XMM10 |
0x4071de LEA 0x1(%R11,%RBX,1),%EDX |
0x4071e3 VMOVSS %XMM10,(%RSI,%RDX,4) |
0x4071e8 VEXTRACTF128 $0x1,%YMM9,%XMM10 |
0x4071ee VCVTSD2SS %XMM10,%XMM10,%XMM11 |
0x4071f3 LEA 0x2(%R11,%RBX,1),%EDX |
0x4071f8 VMOVSS %XMM11,(%RSI,%RDX,4) |
0x4071fd VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 |
0x407203 VCVTSD2SS %XMM10,%XMM10,%XMM10 |
0x407208 LEA 0x3(%R11,%RBX,1),%EDX |
0x40720d VMOVSS %XMM10,(%RSI,%RDX,4) |
0x407212 VEXTRACTF32X4 $0x2,%ZMM9,%XMM10 |
0x407219 VCVTSD2SS %XMM10,%XMM10,%XMM11 |
0x40721e LEA 0x4(%R11,%RBX,1),%EDX |
0x407223 VMOVSS %XMM11,(%RSI,%RDX,4) |
0x407228 VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 |
0x40722e VCVTSD2SS %XMM10,%XMM10,%XMM10 |
0x407233 LEA 0x5(%R11,%RBX,1),%EDX |
0x407238 VMOVSS %XMM10,(%RSI,%RDX,4) |
0x40723d VEXTRACTF32X4 $0x3,%ZMM9,%XMM9 |
0x407244 VCVTSD2SS %XMM9,%XMM9,%XMM10 |
0x407249 LEA 0x6(%R11,%RBX,1),%EDX |
0x40724e VMOVSS %XMM10,(%RSI,%RDX,4) |
0x407253 VSHUFPD $0x1,%XMM9,%XMM9,%XMM9 |
0x407259 VCVTSD2SS %XMM9,%XMM9,%XMM9 |
0x40725e LEA 0x7(%R11,%RBX,1),%EDX |
0x407263 MOV %RSI,%RBX |
0x407266 VMOVSS %XMM9,(%RSI,%RDX,4) |
0x40726b MOV 0x40(%RSP),%EDX |
0x40726f ADD $0x8,%EDX |
0x407272 MOV 0x30(%RSP),%RSI |
0x407277 ADD $0x8,%ESI |
0x40727a CMP 0x38(%RSP),%R14 |
0x40727f LEA 0x1(%R14),%R14 |
0x407283 JE 4073c0 |
0x407289 LEA (,%R14,8),%EBX |
0x407291 TEST %R8,%R8 |
0x407294 MOV %RSI,0x30(%RSP) |
0x407299 MOV %EDX,0x40(%RSP) |
0x40729d VXORPS %XMM9,%XMM9,%XMM9 |
0x4072a2 JE 407140 |
0x4072a8 MOV %EDX,%R8D |
0x4072ab MOV %R13,%R11 |
0x4072ae MOV 0x28(%RSP),%EDX |
0x4072b2 MOV %EDX,%R13D |
0x4072b5 XOR %R12D,%R12D |
0x4072b8 NOPL (%RAX,%RAX,1) |
(28) 0x4072c0 LEA -0x1(%R13),%EDX |
(28) 0x4072c4 VMOVSS (%R11,%RDX,4),%XMM10 |
(28) 0x4072ca VCVTSS2SD %XMM10,%XMM10,%XMM10 |
(28) 0x4072cf LEA 0x5(%RSI),%EDX |
(28) 0x4072d2 VPBROADCASTD %ESI,%XMM11 |
(28) 0x4072d8 VPADDD %YMM7,%YMM11,%YMM12 |
(28) 0x4072dc VPERMT2D %YMM11,%YMM4,%YMM12 |
(28) 0x4072e2 VPBROADCASTD %EDX,%YMM13 |
(28) 0x4072e8 VPBLENDD $0x20,%YMM13,%YMM12,%YMM12 |
(28) 0x4072ee VPADDD %YMM8,%YMM11,%YMM11 |
(28) 0x4072f3 VPERMT2Q %YMM11,%YMM6,%YMM12 |
(28) 0x4072f9 VPSUBD %YMM2,%YMM12,%YMM11 |
(28) 0x4072fd VPXOR %XMM12,%XMM12,%XMM12 |
(28) 0x407302 KXNORB %K0,%K0,%K1 |
(28) 0x407306 VGATHERDPS (%RAX,%YMM11,4),%YMM12{%K1} |
(28) 0x40730d VCVTPS2PD %YMM12,%ZMM11 |
(28) 0x407313 VBROADCASTSD %XMM10,%ZMM10 |
(28) 0x407319 VMULPD %ZMM10,%ZMM11,%ZMM10 |
(28) 0x40731f VADDPD %ZMM10,%ZMM9,%ZMM9 |
(28) 0x407325 MOV %R13D,%EDX |
(28) 0x407328 VMOVSS (%R11,%RDX,4),%XMM10 |
(28) 0x40732e LEA 0x5(%R8),%EDX |
(28) 0x407332 VPBROADCASTD %R8D,%XMM11 |
(28) 0x407338 VPADDD %YMM7,%YMM11,%YMM12 |
(28) 0x40733c VPERMT2D %YMM11,%YMM4,%YMM12 |
(28) 0x407342 VPBROADCASTD %EDX,%YMM13 |
(28) 0x407348 VPBLENDD $0x20,%YMM13,%YMM12,%YMM12 |
(28) 0x40734e VPADDD %YMM8,%YMM11,%YMM11 |
(28) 0x407353 VPERMT2Q %YMM11,%YMM6,%YMM12 |
(28) 0x407359 VPSUBD %YMM2,%YMM12,%YMM11 |
(28) 0x40735d VPXOR %XMM12,%XMM12,%XMM12 |
(28) 0x407362 KXNORB %K0,%K0,%K1 |
(28) 0x407366 VGATHERDPS (%RAX,%YMM11,4),%YMM12{%K1} |
(28) 0x40736d VCVTSS2SD %XMM10,%XMM10,%XMM10 |
(28) 0x407372 VCVTPS2PD %YMM12,%ZMM11 |
(28) 0x407378 VBROADCASTSD %XMM10,%ZMM10 |
(28) 0x40737e VMULPD %ZMM10,%ZMM11,%ZMM10 |
(28) 0x407384 VADDPD %ZMM10,%ZMM9,%ZMM9 |
(28) 0x40738a ADD $0x2,%R12 |
(28) 0x40738e ADD $0x2,%R13D |
(28) 0x407392 ADD %R10D,%R8D |
(28) 0x407395 ADD %R10D,%ESI |
(28) 0x407398 CMP %R12,%RDI |
(28) 0x40739b JNE 4072c0 |
0x4073a1 TESTB $0x1,0x60(%RSP) |
0x4073a6 MOV %R11,%R13 |
0x4073a9 MOV 0xb0(%RSP),%R8 |
0x4073b1 JNE 407143 |
0x4073b7 JMP 4071b8 |
/home/eoseret/llm-attention/attention.cpp: 27 - 33 |
-------------------------------------------------------------------------------- |
27: for (unsigned int j = 0; j < N; ++j) { //vectorized |
28: double sum = 0.0; |
29: #pragma clang loop vectorize(enable) |
30: for (unsigned int k = 0; k < K; ++k) { //vectorized |
31: sum += (double)A[i * K + k] * (double)B[k * N + j]; |
32: } |
33: C[i * N + j] = alpha * static_cast<float>(sum); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 2.08 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.18 |
| Bottlenecks | |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 15.67 |
| CQA cycles if no scalar integer | 15.67 |
| CQA cycles if FP arith vectorized | 15.67 |
| CQA cycles if fully vectorized | 7.54 |
| Front-end cycles | 13.50 |
| P0 cycles | 13.33 |
| P1 cycles | 14.33 |
| P2 cycles | 4.78 |
| P3 cycles | 4.78 |
| P4 cycles | 5.00 |
| P5 cycles | 15.67 |
| P6 cycles | 5.33 |
| P7 cycles | 5.00 |
| P8 cycles | 5.00 |
| P9 cycles | 5.00 |
| P10 cycles | 5.33 |
| P11 cycles | 4.78 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 69.00 |
| Nb uops | 81.00 |
| Nb loads | 9.67 |
| Nb stores | 10.00 |
| Nb stack references | 8.33 |
| FLOP/cycle | 0.68 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 5.33 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.80 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 76.67 |
| Bytes stored | 44.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 36.58 |
| Vectorization ratio load | 19.44 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 100.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 46.65 |
| Vector-efficiency ratio all | 20.30 |
| Vector-efficiency ratio load | 18.58 |
| Vector-efficiency ratio store | 6.88 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 50.00 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 22.15 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.73 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.13 |
| Bottlenecks | P5, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 18.00 |
| CQA cycles if no scalar integer | 18.00 |
| CQA cycles if FP arith vectorized | 18.00 |
| CQA cycles if fully vectorized | 10.43 |
| Front-end cycles | 14.00 |
| P0 cycles | 16.00 |
| P1 cycles | 16.00 |
| P2 cycles | 5.33 |
| P3 cycles | 5.33 |
| P4 cycles | 5.00 |
| P5 cycles | 18.00 |
| P6 cycles | 5.00 |
| P7 cycles | 5.00 |
| P8 cycles | 5.00 |
| P9 cycles | 5.00 |
| P10 cycles | 5.00 |
| P11 cycles | 5.33 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 70.00 |
| Nb uops | 84.00 |
| Nb loads | 9.00 |
| Nb stores | 10.00 |
| Nb stack references | 7.00 |
| FLOP/cycle | 0.89 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 8.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.11 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 84.00 |
| Bytes stored | 44.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 44.19 |
| Vectorization ratio load | 33.33 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 100.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 53.57 |
| Vector-efficiency ratio all | 24.27 |
| Vector-efficiency ratio load | 22.92 |
| Vector-efficiency ratio store | 6.88 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 50.00 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 25.67 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 6.29 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.38 |
| Bottlenecks | micro-operation queue, P1, P5, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 11.00 |
| CQA cycles if no scalar integer | 11.00 |
| CQA cycles if FP arith vectorized | 11.00 |
| CQA cycles if fully vectorized | 1.75 |
| Front-end cycles | 11.00 |
| P0 cycles | 8.00 |
| P1 cycles | 11.00 |
| P2 cycles | 2.67 |
| P3 cycles | 2.67 |
| P4 cycles | 5.00 |
| P5 cycles | 11.00 |
| P6 cycles | 5.00 |
| P7 cycles | 5.00 |
| P8 cycles | 5.00 |
| P9 cycles | 5.00 |
| P10 cycles | 5.00 |
| P11 cycles | 2.67 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 58.00 |
| Nb uops | 66.00 |
| Nb loads | 8.00 |
| Nb stores | 10.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.45 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 49.00 |
| Bytes stored | 44.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 1.00 |
| Stride indirect | 1.00 |
| Vectorization ratio all | 24.24 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 36.36 |
| Vector-efficiency ratio all | 13.26 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 6.88 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 16.19 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.72 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.13 |
| Bottlenecks | P5, |
| Function | main |
| Source | attention.cpp:27-27,attention.cpp:30-33 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 18.00 |
| CQA cycles if no scalar integer | 18.00 |
| CQA cycles if FP arith vectorized | 18.00 |
| CQA cycles if fully vectorized | 10.45 |
| Front-end cycles | 15.50 |
| P0 cycles | 16.00 |
| P1 cycles | 16.00 |
| P2 cycles | 6.33 |
| P3 cycles | 6.33 |
| P4 cycles | 5.00 |
| P5 cycles | 18.00 |
| P6 cycles | 6.00 |
| P7 cycles | 5.00 |
| P8 cycles | 5.00 |
| P9 cycles | 5.00 |
| P10 cycles | 6.00 |
| P11 cycles | 6.33 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 79.00 |
| Nb uops | 93.00 |
| Nb loads | 12.00 |
| Nb stores | 10.00 |
| Nb stack references | 10.00 |
| FLOP/cycle | 0.89 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 8.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 7.83 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 97.00 |
| Bytes stored | 44.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 1.00 |
| Stride indirect | 1.00 |
| Vectorization ratio all | 41.30 |
| Vectorization ratio load | 25.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 100.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 50.00 |
| Vector-efficiency ratio all | 23.37 |
| Vector-efficiency ratio load | 20.31 |
| Vector-efficiency ratio store | 6.88 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 50.00 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 24.58 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 69 |
| nb uops | 81 |
| loop length | 351.33 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 3.67 |
| used ymm registers | 6.33 |
| used zmm registers | 2.33 |
| nb stack references | 8.33 |
| micro-operation queue | 13.50 cycles |
| front end | 13.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 13.33 | 14.33 | 4.78 | 4.78 | 5.00 | 15.67 | 5.33 | 5.00 | 5.00 | 5.00 | 5.33 | 4.78 |
| cycles | 13.33 | 14.33 | 4.78 | 4.78 | 5.00 | 15.67 | 5.33 | 5.00 | 5.00 | 5.00 | 5.33 | 4.78 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 13.50 |
| Dispatch | 15.67 |
| Data deps. | 0.00 |
| Overall L1 | 15.67 |
| all | 35% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 33% |
| all | 36% |
| load | 50% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 51% |
| all | 36% |
| load | 19% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 46% |
| all | 22% |
| load | 12% |
| store | 9% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 20% |
| all | 19% |
| load | 28% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 22% |
| all | 20% |
| load | 18% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 22% |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 70 |
| nb uops | 84 |
| loop length | 360 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 9 |
| used zmm registers | 3 |
| nb stack references | 7 |
| micro-operation queue | 14.00 cycles |
| front end | 14.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 16.00 | 16.00 | 5.33 | 5.33 | 5.00 | 18.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.33 |
| cycles | 16.00 | 16.00 | 5.33 | 5.33 | 5.00 | 18.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.33 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 14.00 |
| Dispatch | 18.00 |
| Data deps. | 0.00 |
| Overall L1 | 18.00 |
| all | 58% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 57% |
| all | 38% |
| load | 50% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 52% |
| all | 44% |
| load | 33% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 53% |
| all | 30% |
| load | 12% |
| store | 9% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 28% |
| all | 21% |
| load | 28% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 24% |
| all | 24% |
| load | 22% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| VMOVAPD %ZMM9,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (100.0%) |
| MOV 0x70(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD %R12D,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| VMOVSS (%R13,%RDX,4),%XMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (6.3%) |
| IMUL 0x18(%RSP),%R12D | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| LEA (%RBX,%R12,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| LEA 0x5(%RBX,%R12,1),%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VPBROADCASTD %EDX,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VPADDD %YMM3,%YMM11,%YMM12 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPERMT2D %YMM11,%YMM4,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VPBROADCASTD %ESI,%YMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VPBLENDD $0x20,%YMM13,%YMM12,%YMM12 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPADDD %YMM5,%YMM11,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPERMT2Q %YMM11,%YMM6,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VPSUBD %YMM2,%YMM12,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 | vect (50.0%) |
| VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| KXNORB %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | N/A |
| VGATHERDPS (%RAX,%YMM11,4),%YMM12{%K1} | 5 | 1 | 1 | 2.67 | 2.67 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2.67 | 29 | 3 | vect (50.0%) |
| VCVTSS2SD %XMM9,%XMM9,%XMM9 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4-5 | 1 | scal (6.3%) |
| VCVTPS2PD %YMM12,%ZMM11 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 | vect (50.0%) |
| VBROADCASTSD %XMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| VFMADD213PD %ZMM10,%ZMM11,%ZMM9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| MOV 0x50(%RSP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| LEA (%R11,%RBX,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV 0xe0(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM9,%XMM9,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x1(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF128 $0x1,%YMM9,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x2(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x3(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF32X4 $0x2,%ZMM9,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x4(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x5(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF32X4 $0x3,%ZMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x6(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM9,%XMM9,%XMM9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM9 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x7(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| VMOVSS %XMM9,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x40(%RSP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD $0x8,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV 0x30(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD $0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| CMP 0x38(%RSP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%R14),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 4073c0 <main+0x3080> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| LEA (,%R14,8),%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| TEST %R8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 | N/A |
| MOV %RSI,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,0x40(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| JE 407140 <main+0x2e00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 58 |
| nb uops | 66 |
| loop length | 291 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 3 |
| used ymm registers | 1 |
| used zmm registers | 1 |
| nb stack references | 8 |
| micro-operation queue | 11.00 cycles |
| front end | 11.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 8.00 | 11.00 | 2.67 | 2.67 | 5.00 | 11.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 2.67 |
| cycles | 8.00 | 11.00 | 2.67 | 2.67 | 5.00 | 11.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 2.67 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 11.00 |
| Dispatch | 11.00 |
| Data deps. | 0.00 |
| Overall L1 | 11.00 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 33% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 50% |
| all | 24% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 36% |
| all | 9% |
| load | 12% |
| store | 9% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 14% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 18% |
| all | 13% |
| load | 12% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 16% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VCVTSD2SS %XMM9,%XMM9,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| MOV 0x50(%RSP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| LEA (%R11,%RBX,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV 0xe0(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM9,%XMM9,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x1(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF128 $0x1,%YMM9,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x2(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x3(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF32X4 $0x2,%ZMM9,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x4(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x5(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF32X4 $0x3,%ZMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x6(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM9,%XMM9,%XMM9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM9 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x7(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| VMOVSS %XMM9,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x40(%RSP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD $0x8,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV 0x30(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD $0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| CMP 0x38(%RSP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%R14),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 4073c0 <main+0x3080> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| LEA (,%R14,8),%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| TEST %R8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 | scal (12.5%) |
| MOV %RSI,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,0x40(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| JE 407140 <main+0x2e00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV %EDX,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| MOV %R13,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| MOV 0x28(%RSP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| MOV %EDX,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | scal (6.3%) |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| TESTB $0x1,0x60(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 2 | 0.33 | N/A |
| MOV %R11,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
| MOV 0xb0(%RSP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| JNE 407143 <main+0x2e03> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| JMP 4071b8 <main+0x2e78> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 | N/A |
| Function | main |
| Source file and lines | attention.cpp:27-33 |
| Module | attention-avx512 |
| nb instructions | 79 |
| nb uops | 93 |
| loop length | 403 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 9 |
| used zmm registers | 3 |
| nb stack references | 10 |
| micro-operation queue | 15.50 cycles |
| front end | 15.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 16.00 | 16.00 | 6.33 | 6.33 | 5.00 | 18.00 | 6.00 | 5.00 | 5.00 | 5.00 | 6.00 | 6.33 |
| cycles | 16.00 | 16.00 | 6.33 | 6.33 | 5.00 | 18.00 | 6.00 | 5.00 | 5.00 | 5.00 | 6.00 | 6.33 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 15.50 |
| Dispatch | 18.00 |
| Data deps. | 0.00 |
| Overall L1 | 18.00 |
| all | 46% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 44% |
| all | 38% |
| load | 50% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 52% |
| all | 41% |
| load | 25% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 50% |
| all | 26% |
| load | 12% |
| store | 9% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 24% |
| all | 21% |
| load | 28% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 24% |
| all | 23% |
| load | 20% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 100% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 24% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVAPD %ZMM9,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 | vect (100.0%) |
| MOV 0x70(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD %R12D,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| VMOVSS (%R13,%RDX,4),%XMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (6.3%) |
| IMUL 0x18(%RSP),%R12D | 1 | 0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| LEA (%RBX,%R12,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| LEA 0x5(%RBX,%R12,1),%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VPBROADCASTD %EDX,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VPADDD %YMM3,%YMM11,%YMM12 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPERMT2D %YMM11,%YMM4,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VPBROADCASTD %ESI,%YMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VPBLENDD $0x20,%YMM13,%YMM12,%YMM12 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPADDD %YMM5,%YMM11,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPERMT2Q %YMM11,%YMM6,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VPSUBD %YMM2,%YMM12,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 | vect (50.0%) |
| VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| KXNORB %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | N/A |
| VGATHERDPS (%RAX,%YMM11,4),%YMM12{%K1} | 5 | 1 | 1 | 2.67 | 2.67 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2.67 | 29 | 3 | vect (50.0%) |
| VCVTSS2SD %XMM9,%XMM9,%XMM9 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4-5 | 1 | scal (6.3%) |
| VCVTPS2PD %YMM12,%ZMM11 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 | vect (50.0%) |
| VBROADCASTSD %XMM9,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| VFMADD213PD %ZMM10,%ZMM11,%ZMM9 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| MOV 0x50(%RSP),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| LEA (%R11,%RBX,1),%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| MOV 0xe0(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM9,%XMM9,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x1(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF128 $0x1,%YMM9,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x2(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x3(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF32X4 $0x2,%ZMM9,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x4(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM11,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM10,%XMM10,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x5(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VEXTRACTF32X4 $0x3,%ZMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM10 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x6(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| VMOVSS %XMM10,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VSHUFPD $0x1,%XMM9,%XMM9,%XMM9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VCVTSD2SS %XMM9,%XMM9,%XMM9 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | scal (12.5%) |
| LEA 0x7(%R11,%RBX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| VMOVSS %XMM9,(%RSI,%RDX,4) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x40(%RSP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD $0x8,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| MOV 0x30(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| ADD $0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
| CMP 0x38(%RSP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | scal (12.5%) |
| LEA 0x1(%R14),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| JE 4073c0 <main+0x3080> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| LEA (,%R14,8),%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 | N/A |
| TEST %R8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 | scal (12.5%) |
| MOV %RSI,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,0x40(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| JE 407140 <main+0x2e00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| MOV %EDX,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (6.3%) |
| MOV %R13,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| MOV 0x28(%RSP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
| MOV %EDX,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
| TESTB $0x1,0x60(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 2 | 0.33 | N/A |
| MOV %R11,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| MOV 0xb0(%RSP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
| JNE 407143 <main+0x2e03> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
