| Loop Id: 30 | Module: attention-icx-skl256 | Source: attention_v2.cpp:43-273 [...] | Coverage: 0.22% |
|---|
| Loop Id: 30 | Module: attention-icx-skl256 | Source: attention_v2.cpp:43-273 [...] | Coverage: 0.22% |
|---|
0x404fe0 MOV 0x138(%RSP),%RAX |
0x404fe8 MOV 0x1d8(%RSP),%RCX |
0x404ff0 VMOVSS %XMM12,(%RAX,%RCX,4) |
0x404ff5 MOV 0x1c8(%RSP),%RAX |
0x404ffd MOV 0x270(%RSP),%RDX |
0x405005 ADD %RAX,%RDX |
0x405008 MOV 0x268(%RSP),%RSI |
0x405010 INC %RSI |
0x405013 MOV 0xe8(%RSP),%RDI |
0x40501b ADD %RAX,%RDI |
0x40501e MOV 0x148(%RSP),%RAX |
0x405026 ADD 0x1a4(%RSP),%EAX |
0x40502d MOV %RAX,0x148(%RSP) |
0x405035 CMP 0x190(%RSP),%RCX |
0x40503d JE 4054a0 |
0x405043 LEA 0x1(%R8),%R9 |
0x405047 VMOVSS 0x8fbd(%RIP),%XMM8 |
0x40504f CMP $0x8,%R9 |
0x405053 JB 4050a0 |
0x405055 MOV %RSI,%RAX |
0x405058 SHR $0x3,%RAX |
0x40505c MOV %RDX,%RCX |
0x40505f VMOVSS 0x8fa5(%RIP),%XMM8 |
0x405067 NOPW (%RAX,%RAX,1) |
(33) 0x405070 VMAXSS -0x1c(%RCX),%XMM8,%XMM0 |
(33) 0x405075 VMAXSS -0x18(%RCX),%XMM0,%XMM0 |
(33) 0x40507a VMAXSS -0x14(%RCX),%XMM0,%XMM0 |
(33) 0x40507f VMAXSS -0x10(%RCX),%XMM0,%XMM0 |
(33) 0x405084 VMAXSS -0xc(%RCX),%XMM0,%XMM0 |
(33) 0x405089 VMAXSS -0x8(%RCX),%XMM0,%XMM0 |
(33) 0x40508e VMAXSS -0x4(%RCX),%XMM0,%XMM0 |
(33) 0x405093 VMAXSS (%RCX),%XMM0,%XMM8 |
(33) 0x405097 ADD $0x20,%RCX |
(33) 0x40509b DEC %RAX |
(33) 0x40509e JNE 405070 |
0x4050a0 MOV %R9,%RCX |
0x4050a3 AND $-0x8,%RCX |
0x4050a7 CMP %R9,%RCX |
0x4050aa JE 4050bd |
0x4050ac MOV %RCX,%RAX |
0x4050af NOP |
(32) 0x4050b0 VMAXSS (%RDI,%RAX,4),%XMM8,%XMM8 |
(32) 0x4050b5 INC %RAX |
(32) 0x4050b8 CMP %RAX,%RSI |
(32) 0x4050bb JNE 4050b0 |
0x4050bd MOV %RDI,0xe8(%RSP) |
0x4050c5 MOV %RSI,0x268(%RSP) |
0x4050cd MOV %RDX,0x270(%RSP) |
0x4050d5 VPBROADCASTQ %RCX,%YMM9 |
0x4050db TEST %RCX,%RCX |
0x4050de MOV %R8,0x1d8(%RSP) |
0x4050e6 MOV %R9,0x140(%RSP) |
0x4050ee MOV %RCX,0x200(%RSP) |
0x4050f6 VPBROADCASTQ %R9,%YMM11 |
0x4050fc JE 405270 |
0x405102 LEA -0x1(%RCX),%RAX |
0x405106 MOV %RAX,0x90(%RSP) |
0x40510e VBROADCASTSS %XMM8,%YMM10 |
0x405113 MOV 0x130(%RSP),%RAX |
0x40511b IMUL %R8,%RAX |
0x40511f MOV %RAX,0x1e0(%RSP) |
0x405127 VXORPS %XMM12,%XMM12,%XMM12 |
0x40512c XOR %ESI,%ESI |
0x40512e MOV 0xe8(%RSP),%RDI |
0x405136 NOPW %CS:(%RAX,%RAX,1) |
(31) 0x405140 VMOVUPS (%RDI,%RSI,4),%YMM0 |
(31) 0x405145 VSUBPS %YMM10,%YMM0,%YMM0 |
(31) 0x40514a CALL 4068b0 <__svml_expf8_l9> |
(31) 0x405150 VADDPS %YMM0,%YMM12,%YMM12 |
(31) 0x405154 ADD $0x8,%RSI |
(31) 0x405158 CMP 0x90(%RSP),%RSI |
(31) 0x405160 JLE 405140 |
0x405162 VEXTRACTF128 $0x1,%YMM12,%XMM0 |
0x405168 VADDPS %XMM0,%XMM12,%XMM0 |
0x40516c VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x405171 VADDPS %XMM1,%XMM0,%XMM0 |
0x405175 VMOVSHDUP %XMM0,%XMM1 |
0x405179 VADDSS %XMM1,%XMM0,%XMM12 |
0x40517d VPCMPEQQ %YMM9,%YMM11,%K0 |
0x405183 KMOVD %K0,%ECX |
0x405187 MOV 0x200(%RSP),%RDX |
0x40518f MOV %RDX,%RAX |
0x405192 TEST $0x1,%CL |
0x405195 JE 405290 |
0x40519b TEST %RDX,%RDX |
0x40519e JE 405334 |
0x4051a4 MOV 0x148(%RSP),%EAX |
0x4051ab MOV 0xb0(%RSP),%RCX |
0x4051b3 LEA (%RCX,%RAX,4),%RAX |
0x4051b7 MOV %RAX,0x90(%RSP) |
0x4051bf LEA -0x1(%RDX),%RAX |
0x4051c3 MOV %RAX,0x120(%RSP) |
0x4051cb VBROADCASTSS %XMM8,%YMM10 |
0x4051d0 VBROADCASTSS %XMM12,%YMM8 |
0x4051d5 VMOVSS 0x8e33(%RIP),%XMM0 |
0x4051dd VDIVSS %XMM12,%XMM0,%XMM0 |
0x4051e2 VBROADCASTSS %XMM0,%YMM11 |
0x4051e7 XOR %EDI,%EDI |
0x4051e9 MOV 0xe8(%RSP),%RSI |
0x4051f1 NOPW %CS:(%RAX,%RAX,1) |
(24) 0x405200 VMOVUPS (%RSI,%RDI,4),%YMM0 |
(24) 0x405205 VSUBPS %YMM10,%YMM0,%YMM0 |
(24) 0x40520a CALL 4068b0 <__svml_expf8_l9> |
(24) 0x405210 VMULPS %YMM0,%YMM11,%YMM0 |
(24) 0x405214 MOV 0x90(%RSP),%RAX |
(24) 0x40521c VMOVUPS %YMM0,(%RAX,%RDI,4) |
(24) 0x405221 ADD $0x8,%RDI |
(24) 0x405225 CMP 0x120(%RSP),%RDI |
(24) 0x40522d JLE 405200 |
0x40522f MOV 0x140(%RSP),%R8 |
0x405237 VPBROADCASTQ %R8,%YMM0 |
0x40523d VPCMPEQQ %YMM9,%YMM0,%K0 |
0x405243 KMOVD %K0,%EAX |
0x405247 TEST $0x1,%AL |
0x405249 MOV 0x200(%RSP),%RCX |
0x405251 JE 40534e |
0x405257 CMP 0x130(%RSP),%R8 |
0x40525f JAE 404fe0 |
0x405265 JMP 405406 |
0x405270 MOV 0x130(%RSP),%RAX |
0x405278 IMUL %R8,%RAX |
0x40527c MOV %RAX,0x1e0(%RSP) |
0x405284 VBROADCASTSS %XMM8,%YMM10 |
0x405289 VXORPS %XMM12,%XMM12,%XMM12 |
0x40528e XOR %EAX,%EAX |
0x405290 VPBROADCASTQ %RAX,%YMM0 |
0x405296 VPSUBQ %YMM0,%YMM11,%YMM1 |
0x40529a VPSUBQ %YMM0,%YMM11,%YMM0 |
0x40529e VPCMPNLEUQ 0x8df7(%RIP),%YMM0,%K0 |
0x4052a9 VPCMPNLEUQ 0x8e0c(%RIP),%YMM1,%K1 |
0x4052b4 KSHIFTLB $0x4,%K1,%K1 |
0x4052ba KORB %K1,%K0,%K1 |
0x4052be KMOVW %K1,0x90(%RSP) |
0x4052c7 ADD 0x1e0(%RSP),%RAX |
0x4052cf MOV 0xc0(%RSP),%RCX |
0x4052d7 VMOVUPS (%RCX,%RAX,4),%YMM0{%K1}{z} |
0x4052de VMOVAPS %YMM0,%YMM13{%K1} |
0x4052e4 VSUBPS %YMM10,%YMM13,%YMM0 |
0x4052e9 VPMOVM2D %K1,%YMM1 |
0x4052ef CALL 406c50 <__svml_expf8_mask_e9> |
0x4052f5 MOV 0x200(%RSP),%RDX |
0x4052fd KMOVW 0x90(%RSP),%K1 |
0x405306 VMOVAPS %YMM0,%YMM0{%K1}{z} |
0x40530c VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x405312 VADDPS %XMM1,%XMM0,%XMM0 |
0x405316 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x40531b VADDPS %XMM1,%XMM0,%XMM0 |
0x40531f VMOVSHDUP %XMM0,%XMM1 |
0x405323 VADDSS %XMM1,%XMM0,%XMM0 |
0x405327 VADDSS %XMM0,%XMM12,%XMM12 |
0x40532b TEST %RDX,%RDX |
0x40532e JNE 4051a4 |
0x405334 MOV 0x140(%RSP),%RAX |
0x40533c VPBROADCASTQ %RAX,%YMM0 |
0x405342 VBROADCASTSS %XMM8,%YMM10 |
0x405347 VBROADCASTSS %XMM12,%YMM8 |
0x40534c XOR %ECX,%ECX |
0x40534e MOV 0x1e0(%RSP),%RDX |
0x405356 MOV %EDX,%EDI |
0x405358 VPBROADCASTQ %RCX,%YMM1 |
0x40535e VPSUBQ %YMM1,%YMM0,%YMM2 |
0x405362 VPSUBQ %YMM1,%YMM0,%YMM0 |
0x405366 VPCMPNLEUQ 0x8d2f(%RIP),%YMM0,%K0 |
0x405371 VPCMPNLEUQ 0x8d44(%RIP),%YMM2,%K1 |
0x40537c KSHIFTLB $0x4,%K1,%K1 |
0x405382 KORB %K1,%K0,%K1 |
0x405386 KMOVW %K1,0x90(%RSP) |
0x40538f ADD %RCX,%RDX |
0x405392 MOV 0xc0(%RSP),%RAX |
0x40539a VMOVUPS (%RAX,%RDX,4),%YMM0{%K1}{z} |
0x4053a1 VMOVAPS %YMM0,%YMM14{%K1} |
0x4053a7 VSUBPS %YMM10,%YMM14,%YMM0 |
0x4053ac VPMOVM2D %K1,%YMM1 |
0x4053b2 MOV %RCX,%RSI |
0x4053b5 CALL 406c50 <__svml_expf8_mask_e9> |
0x4053bb MOV 0x140(%RSP),%R8 |
0x4053c3 VRCP14PS %YMM8,%YMM1 |
0x4053c9 VFMSUB213PS 0x40e010{1to8},%YMM1,%YMM8 |
0x4053d4 VMULPS %YMM0,%YMM1,%YMM0 |
0x4053d8 VFNMADD213PS %YMM0,%YMM8,%YMM0 |
0x4053dd ADD %RSI,%RDI |
0x4053e0 MOV 0xb0(%RSP),%RAX |
0x4053e8 KMOVW 0x90(%RSP),%K1 |
0x4053f1 VMOVUPS %YMM0,(%RAX,%RDI,4){%K1} |
0x4053f8 CMP 0x130(%RSP),%R8 |
0x405400 JAE 404fe0 |
0x405406 MOV 0x240(%RSP),%RDI |
0x40540e MOV 0x1d8(%RSP),%RCX |
0x405416 IMUL %RCX,%RDI |
0x40541a ADD $0x4,%RDI |
0x40541e MOV $0x3fffffffc,%RAX |
0x405428 AND %RAX,%RDI |
0x40542b MOV 0x250(%RSP),%RAX |
0x405433 SUB %RCX,%RAX |
0x405436 ADD 0xb0(%RSP),%RDI |
0x40543e MOV %EAX,%EAX |
0x405440 LEA 0x4(,%RAX,4),%RDX |
0x405448 XOR %ESI,%ESI |
0x40544a VMOVUPS %YMM13,0x200(%RSP) |
0x405453 VMOVUPS %YMM14,0x1e0(%RSP) |
0x40545c VMOVUPS %XMM12,0x90(%RSP) |
0x405465 VZEROUPPER |
0x405468 CALL 4094f0 <_intel_fast_memset> |
0x40546d VMOVUPS 0x90(%RSP),%XMM12 |
0x405476 MOV 0x140(%RSP),%R8 |
0x40547e VMOVUPS 0x1e0(%RSP),%YMM14 |
0x405487 VMOVUPS 0x200(%RSP),%YMM13 |
0x405490 JMP 404fe0 |
/home/eoseret/Applications/llm-attention/attention_v2.cpp: 43 - 273 |
-------------------------------------------------------------------------------- |
43: for (int row = 0; row < N; ++row) { |
44: const float *S_row = &S[row * N]; |
45: |
46: float max_val = -FLT_MAX; |
47: for (int idx = 0; idx <= row; ++idx) // vectorised |
48: if (S_row[idx] > max_val) max_val = S_row[idx]; |
49: |
50: float sum = 0.0f; |
51: #pragma clang loop vectorize(enable) |
52: for (int idx = 0; idx <= row; ++idx) // vectorised |
53: sum += expf(S_row[idx] - max_val); |
54: |
55: for (int idx = 0; idx <= row; ++idx) //vectorised |
56: P[row * N + idx] = expf(S_row[idx] - max_val) / sum; |
57: |
58: for (int idx = row + 1; idx < N; ++idx) |
59: P[row * N + idx] = 0.0f; |
60: |
61: D[row] = sum; |
[...] |
98: if (argc < 4) { |
[...] |
273: start = std::chrono::steady_clock::now(); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-icx-skl256 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.18 |
| CQA speedup if FP arith vectorized | 2.36 |
| CQA speedup if fully vectorized | 8.69 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.83 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention_v2.cpp:43-44,attention_v2.cpp:47-47,attention_v2.cpp:52-61,attention_v2.cpp:98-98,attention_v2.cpp:273-273 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 45.75 |
| CQA cycles if no scalar integer | 21.00 |
| CQA cycles if FP arith vectorized | 19.38 |
| CQA cycles if fully vectorized | 5.26 |
| Front-end cycles | 45.75 |
| P0 cycles | 23.67 |
| P1 cycles | 23.58 |
| P2 cycles | 22.00 |
| P3 cycles | 22.00 |
| P4 cycles | 22.00 |
| P5 cycles | 25.00 |
| P6 cycles | 23.75 |
| P7 cycles | 22.00 |
| DIV/SQRT cycles | 3.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 177.00 |
| Nb uops | 183.00 |
| Nb loads | 48.00 |
| Nb stores | 19.00 |
| Nb stack references | 19.00 |
| FLOP/cycle | 1.84 |
| Nb FLOP add-sub | 35.00 |
| Nb FLOP mul | 8.00 |
| Nb FLOP fma | 16.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 8.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 16.70 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 548.00 |
| Bytes stored | 216.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 45.56 |
| Vectorization ratio load | 55.56 |
| Vectorization ratio store | 23.53 |
| Vectorization ratio mul | 100.00 |
| Vectorization ratio add_sub | 71.43 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 50.00 |
| Vectorization ratio other | 40.91 |
| Vector-efficiency ratio all | 23.72 |
| Vector-efficiency ratio load | 30.90 |
| Vector-efficiency ratio store | 19.49 |
| Vector-efficiency ratio mul | 50.00 |
| Vector-efficiency ratio add_sub | 30.80 |
| Vector-efficiency ratio fma | 50.00 |
| Vector-efficiency ratio div_sqrt | 28.13 |
| Vector-efficiency ratio other | 20.38 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.18 |
| CQA speedup if FP arith vectorized | 2.36 |
| CQA speedup if fully vectorized | 8.69 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.83 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention_v2.cpp:43-44,attention_v2.cpp:47-47,attention_v2.cpp:52-61,attention_v2.cpp:98-98,attention_v2.cpp:273-273 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 45.75 |
| CQA cycles if no scalar integer | 21.00 |
| CQA cycles if FP arith vectorized | 19.38 |
| CQA cycles if fully vectorized | 5.26 |
| Front-end cycles | 45.75 |
| P0 cycles | 23.67 |
| P1 cycles | 23.58 |
| P2 cycles | 22.00 |
| P3 cycles | 22.00 |
| P4 cycles | 22.00 |
| P5 cycles | 25.00 |
| P6 cycles | 23.75 |
| P7 cycles | 22.00 |
| DIV/SQRT cycles | 3.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 177.00 |
| Nb uops | 183.00 |
| Nb loads | 48.00 |
| Nb stores | 19.00 |
| Nb stack references | 19.00 |
| FLOP/cycle | 1.84 |
| Nb FLOP add-sub | 35.00 |
| Nb FLOP mul | 8.00 |
| Nb FLOP fma | 16.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 8.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 16.70 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 548.00 |
| Bytes stored | 216.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 45.56 |
| Vectorization ratio load | 55.56 |
| Vectorization ratio store | 23.53 |
| Vectorization ratio mul | 100.00 |
| Vectorization ratio add_sub | 71.43 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 50.00 |
| Vectorization ratio other | 40.91 |
| Vector-efficiency ratio all | 23.72 |
| Vector-efficiency ratio load | 30.90 |
| Vector-efficiency ratio store | 19.49 |
| Vector-efficiency ratio mul | 50.00 |
| Vector-efficiency ratio add_sub | 30.80 |
| Vector-efficiency ratio fma | 50.00 |
| Vector-efficiency ratio div_sqrt | 28.13 |
| Vector-efficiency ratio other | 20.38 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:43-273 |
| Module | attention-icx-skl256 |
| nb instructions | 177 |
| nb uops | 183 |
| loop length | 1057 |
| used x86 registers | 8 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 10 |
| used zmm registers | 0 |
| nb stack references | 19 |
| ADD-SUB / MUL ratio | 9.00 |
| micro-operation queue | 45.75 cycles |
| front end | 45.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 23.67 | 23.58 | 22.00 | 22.00 | 22.00 | 25.00 | 23.75 | 22.00 |
| cycles | 23.67 | 23.58 | 22.00 | 22.00 | 22.00 | 25.00 | 23.75 | 22.00 |
| Cycles executing div or sqrt instructions | 3.00 |
| Front-end | 45.75 |
| Dispatch | 25.00 |
| DIV/SQRT | 3.00 |
| Overall L1 | 45.75 |
| all | 24% |
| load | 44% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 80% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 26% |
| all | 66% |
| load | 66% |
| store | 80% |
| mul | 100% |
| add-sub | 66% |
| fma | 100% |
| div/sqrt | 50% |
| other | 61% |
| all | 45% |
| load | 55% |
| store | 23% |
| mul | 100% |
| add-sub | 71% |
| fma | 100% |
| div/sqrt | 50% |
| other | 40% |
| all | 20% |
| load | 29% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 42% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 20% |
| all | 27% |
| load | 32% |
| store | 36% |
| mul | 50% |
| add-sub | 24% |
| fma | 50% |
| div/sqrt | 28% |
| other | 20% |
| all | 23% |
| load | 30% |
| store | 19% |
| mul | 50% |
| add-sub | 30% |
| fma | 50% |
| div/sqrt | 28% |
| other | 20% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x138(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x1d8(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM12,(%RAX,%RCX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV 0x1c8(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x270(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %RAX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x268(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| INC %RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xe8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x148(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD 0x1a4(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x148(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| CMP 0x190(%RSP),%RCX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JE 4054a0 <main+0x2380> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA 0x1(%R8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVSS 0x8fbd(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| CMP $0x8,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 4050a0 <main+0x1f80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RSI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x3,%RAX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x8fa5(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x8,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP %R9,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 4050bd <main+0x1f9d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RCX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RDI,0xe8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,0x268(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,0x270(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPBROADCASTQ %RCX,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| TEST %RCX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %R8,0x1d8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R9,0x140(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RCX,0x200(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPBROADCASTQ %R9,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| JE 405270 <main+0x2150> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA -0x1(%RCX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV 0x130(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x1e0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VXORPS %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| MOV 0xe8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM12,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM0,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VPCMPEQQ %YMM9,%YMM11,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (50.0%) |
| KMOVD %K0,%ECX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| MOV 0x200(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1,%CL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JE 405290 <main+0x2170> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| TEST %RDX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 405334 <main+0x2214> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x148(%RSP),%EAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xb0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (%RCX,%RAX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA -0x1(%RDX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x120(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM12,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVSS 0x8e33(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS %XMM12,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VBROADCASTSS %XMM0,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0xe8(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x140(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %R8,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPCMPEQQ %YMM9,%YMM0,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (50.0%) |
| KMOVD %K0,%EAX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| TEST $0x1,%AL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x200(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 40534e <main+0x222e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP 0x130(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 404fe0 <main+0x1ec0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 405406 <main+0x22e6> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0x130(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x1e0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VXORPS %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPBROADCASTQ %RAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %YMM0,%YMM11,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPSUBQ %YMM0,%YMM11,%YMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPCMPNLEUQ 0x8df7(%RIP),%YMM0,%K0 | | | | | | | | | | | | vect (50.0%) |
| VPCMPNLEUQ 0x8e0c(%RIP),%YMM1,%K1 | | | | | | | | | | | | vect (50.0%) |
| KSHIFTLB $0x4,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KORB %K1,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | N/A |
| KMOVW %K1,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD 0x1e0(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV 0xc0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS (%RCX,%RAX,4),%YMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| VMOVAPS %YMM0,%YMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (50.0%) |
| VSUBPS %YMM10,%YMM13,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VPMOVM2D %K1,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (1.6%) |
| CALL 406c50 <__svml_expf8_mask_e9> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x200(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| KMOVW 0x90(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVAPS %YMM0,%YMM0{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (50.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VADDSS %XMM0,%XMM12,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| TEST %RDX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JNE 4051a4 <main+0x2084> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x140(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM12,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1e0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPBROADCASTQ %RCX,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %YMM1,%YMM0,%YMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPSUBQ %YMM1,%YMM0,%YMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPCMPNLEUQ 0x8d2f(%RIP),%YMM0,%K0 | | | | | | | | | | | | vect (50.0%) |
| VPCMPNLEUQ 0x8d44(%RIP),%YMM2,%K1 | | | | | | | | | | | | vect (50.0%) |
| KSHIFTLB $0x4,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KORB %K1,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | N/A |
| KMOVW %K1,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xc0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS (%RAX,%RDX,4),%YMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| VMOVAPS %YMM0,%YMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (50.0%) |
| VSUBPS %YMM10,%YMM14,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VPMOVM2D %K1,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (1.6%) |
| MOV %RCX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| CALL 406c50 <__svml_expf8_mask_e9> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x140(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VRCP14PS %YMM8,%YMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | vect (50.0%) |
| VFMSUB213PS 0x40e010{1to8},%YMM1,%YMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VMULPS %YMM0,%YMM1,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VFNMADD213PS %YMM0,%YMM8,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| ADD %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xb0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| KMOVW 0x90(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVUPS %YMM0,(%RAX,%RDI,4){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| CMP 0x130(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 404fe0 <main+0x1ec0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x240(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x1d8(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD $0x4,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV $0x3fffffffc,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x250(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RCX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD 0xb0(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA 0x4(,%RAX,4),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VMOVUPS %YMM13,0x200(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VMOVUPS %YMM14,0x1e0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VMOVUPS %XMM12,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 4094f0 <_intel_fast_memset> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| VMOVUPS 0x90(%RSP),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| MOV 0x140(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS 0x1e0(%RSP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| VMOVUPS 0x200(%RSP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| JMP 404fe0 <main+0x1ec0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:43-273 |
| Module | attention-icx-skl256 |
| nb instructions | 177 |
| nb uops | 183 |
| loop length | 1057 |
| used x86 registers | 8 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 10 |
| used zmm registers | 0 |
| nb stack references | 19 |
| ADD-SUB / MUL ratio | 9.00 |
| micro-operation queue | 45.75 cycles |
| front end | 45.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 23.67 | 23.58 | 22.00 | 22.00 | 22.00 | 25.00 | 23.75 | 22.00 |
| cycles | 23.67 | 23.58 | 22.00 | 22.00 | 22.00 | 25.00 | 23.75 | 22.00 |
| Cycles executing div or sqrt instructions | 3.00 |
| Front-end | 45.75 |
| Dispatch | 25.00 |
| DIV/SQRT | 3.00 |
| Overall L1 | 45.75 |
| all | 24% |
| load | 44% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 80% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 26% |
| all | 66% |
| load | 66% |
| store | 80% |
| mul | 100% |
| add-sub | 66% |
| fma | 100% |
| div/sqrt | 50% |
| other | 61% |
| all | 45% |
| load | 55% |
| store | 23% |
| mul | 100% |
| add-sub | 71% |
| fma | 100% |
| div/sqrt | 50% |
| other | 40% |
| all | 20% |
| load | 29% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 42% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 20% |
| all | 27% |
| load | 32% |
| store | 36% |
| mul | 50% |
| add-sub | 24% |
| fma | 50% |
| div/sqrt | 28% |
| other | 20% |
| all | 23% |
| load | 30% |
| store | 19% |
| mul | 50% |
| add-sub | 30% |
| fma | 50% |
| div/sqrt | 28% |
| other | 20% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x138(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x1d8(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM12,(%RAX,%RCX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV 0x1c8(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x270(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %RAX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x268(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| INC %RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xe8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x148(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD 0x1a4(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x148(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| CMP 0x190(%RSP),%RCX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JE 4054a0 <main+0x2380> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA 0x1(%R8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVSS 0x8fbd(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| CMP $0x8,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 4050a0 <main+0x1f80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RSI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x3,%RAX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x8fa5(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R9,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x8,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP %R9,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 4050bd <main+0x1f9d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RCX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RDI,0xe8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,0x268(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,0x270(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPBROADCASTQ %RCX,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| TEST %RCX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %R8,0x1d8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R9,0x140(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RCX,0x200(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPBROADCASTQ %R9,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| JE 405270 <main+0x2150> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA -0x1(%RCX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV 0x130(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x1e0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VXORPS %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| MOV 0xe8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM12,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM0,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VPCMPEQQ %YMM9,%YMM11,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (50.0%) |
| KMOVD %K0,%ECX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| MOV 0x200(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1,%CL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JE 405290 <main+0x2170> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| TEST %RDX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 405334 <main+0x2214> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x148(%RSP),%EAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xb0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (%RCX,%RAX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA -0x1(%RDX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x120(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM12,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVSS 0x8e33(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS %XMM12,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VBROADCASTSS %XMM0,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0xe8(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x140(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %R8,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPCMPEQQ %YMM9,%YMM0,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (50.0%) |
| KMOVD %K0,%EAX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| TEST $0x1,%AL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x200(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 40534e <main+0x222e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP 0x130(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 404fe0 <main+0x1ec0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 405406 <main+0x22e6> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0x130(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x1e0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VXORPS %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPBROADCASTQ %RAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %YMM0,%YMM11,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPSUBQ %YMM0,%YMM11,%YMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPCMPNLEUQ 0x8df7(%RIP),%YMM0,%K0 | | | | | | | | | | | | vect (50.0%) |
| VPCMPNLEUQ 0x8e0c(%RIP),%YMM1,%K1 | | | | | | | | | | | | vect (50.0%) |
| KSHIFTLB $0x4,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KORB %K1,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | N/A |
| KMOVW %K1,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD 0x1e0(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV 0xc0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS (%RCX,%RAX,4),%YMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| VMOVAPS %YMM0,%YMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (50.0%) |
| VSUBPS %YMM10,%YMM13,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VPMOVM2D %K1,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (1.6%) |
| CALL 406c50 <__svml_expf8_mask_e9> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x200(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| KMOVW 0x90(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVAPS %YMM0,%YMM0{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (50.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VADDSS %XMM0,%XMM12,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| TEST %RDX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JNE 4051a4 <main+0x2084> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x140(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM8,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM12,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1e0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPBROADCASTQ %RCX,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %YMM1,%YMM0,%YMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPSUBQ %YMM1,%YMM0,%YMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| VPCMPNLEUQ 0x8d2f(%RIP),%YMM0,%K0 | | | | | | | | | | | | vect (50.0%) |
| VPCMPNLEUQ 0x8d44(%RIP),%YMM2,%K1 | | | | | | | | | | | | vect (50.0%) |
| KSHIFTLB $0x4,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KORB %K1,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | N/A |
| KMOVW %K1,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xc0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS (%RAX,%RDX,4),%YMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| VMOVAPS %YMM0,%YMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (50.0%) |
| VSUBPS %YMM10,%YMM14,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VPMOVM2D %K1,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | scal (1.6%) |
| MOV %RCX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| CALL 406c50 <__svml_expf8_mask_e9> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x140(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VRCP14PS %YMM8,%YMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | vect (50.0%) |
| VFMSUB213PS 0x40e010{1to8},%YMM1,%YMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VMULPS %YMM0,%YMM1,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VFNMADD213PS %YMM0,%YMM8,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| ADD %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xb0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| KMOVW 0x90(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVUPS %YMM0,(%RAX,%RDI,4){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| CMP 0x130(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 404fe0 <main+0x1ec0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x240(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x1d8(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD $0x4,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV $0x3fffffffc,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x250(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RCX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD 0xb0(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA 0x4(,%RAX,4),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VMOVUPS %YMM13,0x200(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VMOVUPS %YMM14,0x1e0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VMOVUPS %XMM12,0x90(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 4094f0 <_intel_fast_memset> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| VMOVUPS 0x90(%RSP),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| MOV 0x140(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS 0x1e0(%RSP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| VMOVUPS 0x200(%RSP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| JMP 404fe0 <main+0x1ec0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
