| Loop Id: 22 | Module: attention-icx-skl512 | Source: attention_v2.cpp:43-273 [...] | Coverage: 0.51% |
|---|
| Loop Id: 22 | Module: attention-icx-skl512 | Source: attention_v2.cpp:43-273 [...] | Coverage: 0.51% |
|---|
0x405480 MOV 0x130(%RSP),%R12 |
0x405488 MOV 0x150(%RSP),%RCX |
0x405490 VMOVSS %XMM20,(%R12,%RCX,4) |
0x405497 MOV 0x1b8(%RSP),%RAX |
0x40549f MOV 0x250(%RSP),%RDX |
0x4054a7 ADD %RAX,%RDX |
0x4054aa MOV 0x248(%RSP),%RSI |
0x4054b2 INC %RSI |
0x4054b5 MOV 0x128(%RSP),%RDI |
0x4054bd ADD %RAX,%RDI |
0x4054c0 MOV 0x118(%RSP),%RAX |
0x4054c8 ADD 0x194(%RSP),%EAX |
0x4054cf MOV %RAX,0x118(%RSP) |
0x4054d7 CMP 0x188(%RSP),%RCX |
0x4054df JE 405970 |
0x4054e5 LEA 0x1(%R8),%R9 |
0x4054e9 VMOVSS 0x8b15(%RIP),%XMM16 |
0x4054f3 CMP $0x8,%R9 |
0x4054f7 JB 405544 |
0x4054f9 MOV %RSI,%RAX |
0x4054fc SHR $0x3,%RAX |
0x405500 MOV %RDX,%RCX |
0x405503 VMOVSS 0x8afb(%RIP),%XMM16 |
0x40550d NOPL (%RAX) |
(25) 0x405510 VMAXSS -0x1c(%RCX),%XMM16,%XMM0 |
(25) 0x405517 VMAXSS -0x18(%RCX),%XMM0,%XMM0 |
(25) 0x40551c VMAXSS -0x14(%RCX),%XMM0,%XMM0 |
(25) 0x405521 VMAXSS -0x10(%RCX),%XMM0,%XMM0 |
(25) 0x405526 VMAXSS -0xc(%RCX),%XMM0,%XMM0 |
(25) 0x40552b VMAXSS -0x8(%RCX),%XMM0,%XMM0 |
(25) 0x405530 VMAXSS -0x4(%RCX),%XMM0,%XMM0 |
(25) 0x405535 VMAXSS (%RCX),%XMM0,%XMM16 |
(25) 0x40553b ADD $0x20,%RCX |
(25) 0x40553f DEC %RAX |
(25) 0x405542 JNE 405510 |
0x405544 MOV %R9,%RAX |
0x405547 AND $-0x8,%RAX |
0x40554b CMP %R9,%RAX |
0x40554e MOV 0x78(%RSP),%R12 |
0x405553 JE 40556f |
0x405555 NOPW %CS:(%RAX,%RAX,1) |
(24) 0x405560 VMAXSS (%RDI,%RAX,4),%XMM16,%XMM16 |
(24) 0x405567 INC %RAX |
(24) 0x40556a CMP %RAX,%RSI |
(24) 0x40556d JNE 405560 |
0x40556f MOV %RDI,0x128(%RSP) |
0x405577 MOV %RSI,0x248(%RSP) |
0x40557f MOV %RDX,0x250(%RSP) |
0x405587 MOV %R9,%RDI |
0x40558a AND $-0x10,%RDI |
0x40558e VPBROADCASTQ %RDI,%ZMM17 |
0x405594 MOV %R8,0x150(%RSP) |
0x40559c MOV %R9,0x100(%RSP) |
0x4055a4 MOV %RDI,0x2c0(%RSP) |
0x4055ac VPBROADCASTQ %R9,%ZMM19 |
0x4055b2 JE 405740 |
0x4055b8 DEC %RDI |
0x4055bb VBROADCASTSS %XMM16,%ZMM18 |
0x4055c1 MOV 0xe0(%RSP),%RAX |
0x4055c9 IMUL %R8,%RAX |
0x4055cd MOV %RAX,0x280(%RSP) |
0x4055d5 VPXORD %XMM20,%XMM20,%XMM20 |
0x4055db XOR %ESI,%ESI |
0x4055dd MOV $0x4069f0,%R12 |
0x4055e4 MOV 0x128(%RSP),%R13 |
0x4055ec NOPL (%RAX) |
(23) 0x4055f0 VMOVUPS (%R13,%RSI,4),%ZMM0 |
(23) 0x4055f8 VSUBPS %ZMM18,%ZMM0,%ZMM0 |
(23) 0x4055fe CALL %R12 |
(23) 0x405601 VADDPS %ZMM20,%ZMM0,%ZMM20 |
(23) 0x405607 ADD $0x10,%RSI |
(23) 0x40560b CMP %RDI,%RSI |
(23) 0x40560e JLE 4055f0 |
0x405610 VEXTRACTF64X4 $0x1,%ZMM20,%YMM0 |
0x405617 VADDPS %ZMM0,%ZMM20,%ZMM0 |
0x40561d VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x405623 VADDPS %XMM1,%XMM0,%XMM0 |
0x405627 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x40562c VADDPS %XMM1,%XMM0,%XMM0 |
0x405630 VMOVSHDUP %XMM0,%XMM1 |
0x405634 VADDSS %XMM1,%XMM0,%XMM20 |
0x40563a VPCMPEQQ %ZMM17,%ZMM19,%K0 |
0x405640 KMOVD %K0,%ECX |
0x405644 MOV 0x2c0(%RSP),%RDI |
0x40564c MOV %RDI,%RAX |
0x40564f TEST $0x1,%CL |
0x405652 MOV 0xa0(%RSP),%RSI |
0x40565a MOV 0x78(%RSP),%R12 |
0x40565f JE 40576a |
0x405665 TEST %RDI,%RDI |
0x405668 MOV 0x98(%RSP),%R13 |
0x405670 JE 40580e |
0x405676 MOV 0x118(%RSP),%EAX |
0x40567d LEA (%R12,%RAX,4),%RAX |
0x405681 MOV %RAX,0x80(%RSP) |
0x405689 LEA -0x1(%RDI),%RSI |
0x40568d VBROADCASTSS %XMM16,%ZMM18 |
0x405693 VBROADCASTSS %XMM20,%ZMM16 |
0x405699 VMOVSS 0x896b(%RIP),%XMM0 |
0x4056a1 VDIVSS %XMM20,%XMM0,%XMM0 |
0x4056a7 VBROADCASTSS %XMM0,%ZMM19 |
0x4056ad XOR %R12D,%R12D |
0x4056b0 MOV $0x4069f0,%R13 |
0x4056b7 MOV 0x128(%RSP),%RDI |
0x4056bf NOP |
(17) 0x4056c0 VMOVUPS (%RDI,%R12,4),%ZMM0 |
(17) 0x4056c7 VSUBPS %ZMM18,%ZMM0,%ZMM0 |
(17) 0x4056cd CALL %R13 |
(17) 0x4056d0 VMULPS %ZMM19,%ZMM0,%ZMM0 |
(17) 0x4056d6 MOV 0x80(%RSP),%RAX |
(17) 0x4056de VMOVUPS %ZMM0,(%RAX,%R12,4) |
(17) 0x4056e5 ADD $0x10,%R12 |
(17) 0x4056e9 CMP %RSI,%R12 |
(17) 0x4056ec JLE 4056c0 |
0x4056ee MOV 0x100(%RSP),%R8 |
0x4056f6 VPBROADCASTQ %R8,%ZMM0 |
0x4056fc VPCMPEQQ %ZMM17,%ZMM0,%K0 |
0x405702 KMOVD %K0,%EAX |
0x405706 TEST $0x1,%AL |
0x405708 MOV 0x98(%RSP),%R13 |
0x405710 MOV 0xa0(%RSP),%RSI |
0x405718 MOV 0x78(%RSP),%R12 |
0x40571d JE 405832 |
0x405723 CMP 0xe0(%RSP),%R8 |
0x40572b JAE 405480 |
0x405731 JMP 4058e5 |
0x405740 MOV 0xe0(%RSP),%RAX |
0x405748 IMUL %R8,%RAX |
0x40574c MOV %RAX,0x280(%RSP) |
0x405754 VBROADCASTSS %XMM16,%ZMM18 |
0x40575a VPXORD %XMM20,%XMM20,%XMM20 |
0x405760 XOR %EAX,%EAX |
0x405762 MOV 0xa0(%RSP),%RSI |
0x40576a VPBROADCASTQ %RAX,%ZMM0 |
0x405770 VPSUBQ %ZMM0,%ZMM19,%ZMM1 |
0x405776 VPSUBQ %ZMM0,%ZMM19,%ZMM0 |
0x40577c VPCMPNLEUQ 0x8a79(%RIP),%ZMM0,%K0 |
0x405787 VPCMPNLEUQ 0x8aae(%RIP),%ZMM1,%K1 |
0x405792 KUNPCKBW %K0,%K1,%K1 |
0x405796 KMOVW %K1,0x80(%RSP) |
0x40579f ADD 0x280(%RSP),%RAX |
0x4057a7 VMOVUPS (%RSI,%RAX,4),%ZMM0{%K1}{z} |
0x4057ae VMOVAPS %ZMM0,%ZMM21{%K1} |
0x4057b4 VSUBPS %ZMM18,%ZMM21,%ZMM1 |
0x4057ba CALL 406aa0 <__svml_expf16_mask_z0> |
0x4057c0 KMOVW 0x80(%RSP),%K1 |
0x4057c9 VMOVAPS %ZMM0,%ZMM0{%K1}{z} |
0x4057cf VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 |
0x4057d6 VADDPS %ZMM1,%ZMM0,%ZMM0 |
0x4057dc VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x4057e2 VADDPS %XMM1,%XMM0,%XMM0 |
0x4057e6 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x4057eb VADDPS %XMM1,%XMM0,%XMM0 |
0x4057ef VMOVSHDUP %XMM0,%XMM1 |
0x4057f3 VADDSS %XMM1,%XMM0,%XMM0 |
0x4057f7 VADDSS %XMM0,%XMM20,%XMM20 |
0x4057fd TEST %RDI,%RDI |
0x405800 MOV 0x98(%RSP),%R13 |
0x405808 JNE 405676 |
0x40580e MOV 0x100(%RSP),%RAX |
0x405816 VPBROADCASTQ %RAX,%ZMM0 |
0x40581c VBROADCASTSS %XMM16,%ZMM18 |
0x405822 VBROADCASTSS %XMM20,%ZMM16 |
0x405828 XOR %EAX,%EAX |
0x40582a MOV %RAX,0x2c0(%RSP) |
0x405832 MOV %R12,0x78(%RSP) |
0x405837 MOV 0x280(%RSP),%RCX |
0x40583f MOV %ECX,%EDI |
0x405841 MOV 0x2c0(%RSP),%R12 |
0x405849 VPBROADCASTQ %R12,%ZMM1 |
0x40584f VPSUBQ %ZMM1,%ZMM0,%ZMM2 |
0x405855 VPSUBQ %ZMM1,%ZMM0,%ZMM0 |
0x40585b VPCMPNLEUQ 0x899a(%RIP),%ZMM0,%K0 |
0x405866 VPCMPNLEUQ 0x89cf(%RIP),%ZMM2,%K1 |
0x405871 KUNPCKBW %K0,%K1,%K1 |
0x405875 KMOVW %K1,0x80(%RSP) |
0x40587e ADD %R12,%RCX |
0x405881 VMOVUPS (%RSI,%RCX,4),%ZMM0{%K1}{z} |
0x405888 VMOVAPS %ZMM0,%ZMM22{%K1} |
0x40588e VSUBPS %ZMM18,%ZMM22,%ZMM1 |
0x405894 CALL 406aa0 <__svml_expf16_mask_z0> |
0x40589a MOV 0x100(%RSP),%R8 |
0x4058a2 VRCP14PS %ZMM16,%ZMM1 |
0x4058a8 VFMSUB213PS 0x40e00c{1to16},%ZMM1,%ZMM16 |
0x4058b3 VMULPS %ZMM0,%ZMM1,%ZMM0 |
0x4058b9 VFNMADD213PS %ZMM0,%ZMM16,%ZMM0 |
0x4058bf ADD %R12,%RDI |
0x4058c2 MOV 0x78(%RSP),%RAX |
0x4058c7 KMOVW 0x80(%RSP),%K1 |
0x4058d0 VMOVUPS %ZMM0,(%RAX,%RDI,4){%K1} |
0x4058d7 CMP 0xe0(%RSP),%R8 |
0x4058df JAE 405480 |
0x4058e5 MOV 0x220(%RSP),%RDI |
0x4058ed MOV 0x150(%RSP),%RCX |
0x4058f5 IMUL %RCX,%RDI |
0x4058f9 ADD $0x4,%RDI |
0x4058fd MOV $0x3fffffffc,%RAX |
0x405907 AND %RAX,%RDI |
0x40590a MOV 0x230(%RSP),%RAX |
0x405912 SUB %RCX,%RAX |
0x405915 ADD 0x78(%RSP),%RDI |
0x40591a MOV %EAX,%EAX |
0x40591c LEA 0x4(,%RAX,4),%RDX |
0x405924 XOR %ESI,%ESI |
0x405926 VMOVUPS %ZMM21,0x2c0(%RSP) |
0x40592e VMOVUPS %ZMM22,0x280(%RSP) |
0x405936 VMOVUPS %XMM20,0x80(%RSP) |
0x40593e VZEROUPPER |
0x405941 CALL 406bf0 <_intel_fast_memset> |
0x405946 VMOVUPS 0x80(%RSP),%XMM20 |
0x40594e MOV 0x100(%RSP),%R8 |
0x405956 VMOVUPS 0x280(%RSP),%ZMM22 |
0x40595e VMOVUPS 0x2c0(%RSP),%ZMM21 |
0x405966 JMP 405480 |
/home/eoseret/Applications/llm-attention/attention_v2.cpp: 43 - 273 |
-------------------------------------------------------------------------------- |
43: for (int row = 0; row < N; ++row) { |
44: const float *S_row = &S[row * N]; |
45: |
46: float max_val = -FLT_MAX; |
47: for (int idx = 0; idx <= row; ++idx) // vectorised |
48: if (S_row[idx] > max_val) max_val = S_row[idx]; |
49: |
50: float sum = 0.0f; |
51: #pragma clang loop vectorize(enable) |
52: for (int idx = 0; idx <= row; ++idx) // vectorised |
53: sum += expf(S_row[idx] - max_val); |
54: |
55: for (int idx = 0; idx <= row; ++idx) //vectorised |
56: P[row * N + idx] = expf(S_row[idx] - max_val) / sum; |
57: |
58: for (int idx = row + 1; idx < N; ++idx) |
59: P[row * N + idx] = 0.0f; |
60: |
61: D[row] = sum; |
[...] |
98: if (argc < 4) { |
[...] |
273: start = std::chrono::steady_clock::now(); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-icx-skl512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.08 |
| CQA speedup if FP arith vectorized | 1.04 |
| CQA speedup if fully vectorized | 1.38 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.77 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention_v2.cpp:43-44,attention_v2.cpp:47-47,attention_v2.cpp:52-61,attention_v2.cpp:98-98,attention_v2.cpp:273-273 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 47.75 |
| CQA cycles if no scalar integer | 23.00 |
| CQA cycles if FP arith vectorized | 45.99 |
| CQA cycles if fully vectorized | 34.59 |
| Front-end cycles | 47.75 |
| P0 cycles | 25.00 |
| P1 cycles | 25.00 |
| P2 cycles | 24.50 |
| P3 cycles | 24.50 |
| P4 cycles | 22.00 |
| P5 cycles | 27.00 |
| P6 cycles | 25.00 |
| P7 cycles | 22.00 |
| DIV/SQRT cycles | 3.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 183.00 |
| Nb uops | 191.00 |
| Nb loads | 53.00 |
| Nb stores | 19.00 |
| Nb stack references | 19.00 |
| FLOP/cycle | 3.77 |
| Nb FLOP add-sub | 83.00 |
| Nb FLOP mul | 16.00 |
| Nb FLOP fma | 32.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 16.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 24.21 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 844.00 |
| Bytes stored | 312.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 48.91 |
| Vectorization ratio load | 47.62 |
| Vectorization ratio store | 23.53 |
| Vectorization ratio mul | 100.00 |
| Vectorization ratio add_sub | 75.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 50.00 |
| Vectorization ratio other | 48.78 |
| Vector-efficiency ratio all | 40.63 |
| Vector-efficiency ratio load | 49.70 |
| Vector-efficiency ratio store | 28.31 |
| Vector-efficiency ratio mul | 100.00 |
| Vector-efficiency ratio add_sub | 58.20 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 53.13 |
| Vector-efficiency ratio other | 34.45 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.08 |
| CQA speedup if FP arith vectorized | 1.04 |
| CQA speedup if fully vectorized | 1.38 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.77 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | attention_v2.cpp:43-44,attention_v2.cpp:47-47,attention_v2.cpp:52-61,attention_v2.cpp:98-98,attention_v2.cpp:273-273 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 47.75 |
| CQA cycles if no scalar integer | 23.00 |
| CQA cycles if FP arith vectorized | 45.99 |
| CQA cycles if fully vectorized | 34.59 |
| Front-end cycles | 47.75 |
| P0 cycles | 25.00 |
| P1 cycles | 25.00 |
| P2 cycles | 24.50 |
| P3 cycles | 24.50 |
| P4 cycles | 22.00 |
| P5 cycles | 27.00 |
| P6 cycles | 25.00 |
| P7 cycles | 22.00 |
| DIV/SQRT cycles | 3.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 183.00 |
| Nb uops | 191.00 |
| Nb loads | 53.00 |
| Nb stores | 19.00 |
| Nb stack references | 19.00 |
| FLOP/cycle | 3.77 |
| Nb FLOP add-sub | 83.00 |
| Nb FLOP mul | 16.00 |
| Nb FLOP fma | 32.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 16.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 24.21 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 844.00 |
| Bytes stored | 312.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 48.91 |
| Vectorization ratio load | 47.62 |
| Vectorization ratio store | 23.53 |
| Vectorization ratio mul | 100.00 |
| Vectorization ratio add_sub | 75.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 50.00 |
| Vectorization ratio other | 48.78 |
| Vector-efficiency ratio all | 40.63 |
| Vector-efficiency ratio load | 49.70 |
| Vector-efficiency ratio store | 28.31 |
| Vector-efficiency ratio mul | 100.00 |
| Vector-efficiency ratio add_sub | 58.20 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 53.13 |
| Vector-efficiency ratio other | 34.45 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:43-273 |
| Module | attention-icx-skl512 |
| nb instructions | 183 |
| nb uops | 191 |
| loop length | 1104 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 2 |
| used zmm registers | 10 |
| nb stack references | 19 |
| ADD-SUB / MUL ratio | 11.00 |
| micro-operation queue | 47.75 cycles |
| front end | 47.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 25.00 | 25.00 | 24.50 | 24.50 | 22.00 | 27.00 | 25.00 | 22.00 |
| cycles | 25.00 | 25.00 | 24.50 | 24.50 | 22.00 | 27.00 | 25.00 | 22.00 |
| Cycles executing div or sqrt instructions | 3.00 |
| Front-end | 47.75 |
| Dispatch | 27.00 |
| DIV/SQRT | 3.00 |
| Overall L1 | 47.75 |
| all | 28% |
| load | 33% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 80% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 39% |
| all | 68% |
| load | 66% |
| store | 80% |
| mul | 100% |
| add-sub | 72% |
| fma | 100% |
| div/sqrt | 50% |
| other | 61% |
| all | 48% |
| load | 47% |
| store | 23% |
| mul | 100% |
| add-sub | 75% |
| fma | 100% |
| div/sqrt | 50% |
| other | 48% |
| all | 32% |
| load | 41% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 82% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 36% |
| all | 48% |
| load | 60% |
| store | 66% |
| mul | 100% |
| add-sub | 47% |
| fma | 100% |
| div/sqrt | 53% |
| other | 31% |
| all | 40% |
| load | 49% |
| store | 28% |
| mul | 100% |
| add-sub | 58% |
| fma | 100% |
| div/sqrt | 53% |
| other | 34% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x130(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x150(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM20,(%R12,%RCX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV 0x1b8(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x250(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| ADD %RAX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x248(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| INC %RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x128(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x118(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD 0x194(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x118(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| CMP 0x188(%RSP),%RCX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JE 405970 <main+0x26c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA 0x1(%R8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVSS 0x8b15(%RIP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| CMP $0x8,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 405544 <main+0x2294> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RSI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x3,%RAX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x8afb(%RIP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP %R9,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x78(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 40556f <main+0x22bf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RDI,0x128(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,0x248(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,0x250(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R9,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x10,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VPBROADCASTQ %RDI,%ZMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV %R8,0x150(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R9,0x100(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDI,0x2c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPBROADCASTQ %R9,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| JE 405740 <main+0x2490> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| DEC %RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x280(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPXORD %XMM20,%XMM20,%XMM20 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV $0x4069f0,%R12 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x128(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF64X4 $0x1,%ZMM20,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPS %ZMM0,%ZMM20,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VPCMPEQQ %ZMM17,%ZMM19,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (100.0%) |
| KMOVD %K0,%ECX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| MOV 0x2c0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1,%CL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xa0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x78(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 40576a <main+0x24ba> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| TEST %RDI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x98(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 40580e <main+0x255e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x118(%RSP),%EAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (%R12,%RAX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA -0x1(%RDI),%RSI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM20,%ZMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVSS 0x896b(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS %XMM20,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VBROADCASTSS %XMM0,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV $0x4069f0,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV 0x128(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %R8,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPCMPEQQ %ZMM17,%ZMM0,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (100.0%) |
| KMOVD %K0,%EAX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| TEST $0x1,%AL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x98(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0xa0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x78(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 405832 <main+0x2582> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP 0xe0(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 405480 <main+0x21d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 4058e5 <main+0x2635> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x280(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VPXORD %XMM20,%XMM20,%XMM20 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0xa0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %ZMM0,%ZMM19,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPSUBQ %ZMM0,%ZMM19,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPCMPNLEUQ 0x8a79(%RIP),%ZMM0,%K0 | | | | | | | | | | | | vect (100.0%) |
| VPCMPNLEUQ 0x8aae(%RIP),%ZMM1,%K1 | | | | | | | | | | | | vect (100.0%) |
| KUNPCKBW %K0,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KMOVW %K1,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD 0x280(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| VMOVUPS (%RSI,%RAX,4),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| VMOVAPS %ZMM0,%ZMM21{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VSUBPS %ZMM18,%ZMM21,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| CALL 406aa0 <__svml_expf16_mask_z0> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| KMOVW 0x80(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVAPS %ZMM0,%ZMM0{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPS %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VADDSS %XMM0,%XMM20,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| TEST %RDI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x98(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JNE 405676 <main+0x23c6> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x100(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM20,%ZMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RAX,0x2c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R12,0x78(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x280(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %ECX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x2c0(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %R12,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %ZMM1,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPSUBQ %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPCMPNLEUQ 0x899a(%RIP),%ZMM0,%K0 | | | | | | | | | | | | vect (100.0%) |
| VPCMPNLEUQ 0x89cf(%RIP),%ZMM2,%K1 | | | | | | | | | | | | vect (100.0%) |
| KUNPCKBW %K0,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KMOVW %K1,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD %R12,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VMOVUPS (%RSI,%RCX,4),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| VMOVAPS %ZMM0,%ZMM22{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VSUBPS %ZMM18,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| CALL 406aa0 <__svml_expf16_mask_z0> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VRCP14PS %ZMM16,%ZMM1 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 7-8 | 2 | vect (100.0%) |
| VFMSUB213PS 0x40e00c{1to16},%ZMM1,%ZMM16 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VMULPS %ZMM0,%ZMM1,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VFNMADD213PS %ZMM0,%ZMM16,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| ADD %R12,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x78(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| KMOVW 0x80(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVUPS %ZMM0,(%RAX,%RDI,4){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (100.0%) |
| CMP 0xe0(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 405480 <main+0x21d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x220(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x150(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD $0x4,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV $0x3fffffffc,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x230(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RCX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD 0x78(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA 0x4(,%RAX,4),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVUPS %ZMM21,0x2c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (100.0%) |
| VMOVUPS %ZMM22,0x280(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (100.0%) |
| VMOVUPS %XMM20,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 406bf0 <_intel_fast_memset> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| VMOVUPS 0x80(%RSP),%XMM20 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS 0x280(%RSP),%ZMM22 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| VMOVUPS 0x2c0(%RSP),%ZMM21 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| JMP 405480 <main+0x21d0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:43-273 |
| Module | attention-icx-skl512 |
| nb instructions | 183 |
| nb uops | 191 |
| loop length | 1104 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 2 |
| used zmm registers | 10 |
| nb stack references | 19 |
| ADD-SUB / MUL ratio | 11.00 |
| micro-operation queue | 47.75 cycles |
| front end | 47.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 25.00 | 25.00 | 24.50 | 24.50 | 22.00 | 27.00 | 25.00 | 22.00 |
| cycles | 25.00 | 25.00 | 24.50 | 24.50 | 22.00 | 27.00 | 25.00 | 22.00 |
| Cycles executing div or sqrt instructions | 3.00 |
| Front-end | 47.75 |
| Dispatch | 27.00 |
| DIV/SQRT | 3.00 |
| Overall L1 | 47.75 |
| all | 28% |
| load | 33% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 80% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 39% |
| all | 68% |
| load | 66% |
| store | 80% |
| mul | 100% |
| add-sub | 72% |
| fma | 100% |
| div/sqrt | 50% |
| other | 61% |
| all | 48% |
| load | 47% |
| store | 23% |
| mul | 100% |
| add-sub | 75% |
| fma | 100% |
| div/sqrt | 50% |
| other | 48% |
| all | 32% |
| load | 41% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 82% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 36% |
| all | 48% |
| load | 60% |
| store | 66% |
| mul | 100% |
| add-sub | 47% |
| fma | 100% |
| div/sqrt | 53% |
| other | 31% |
| all | 40% |
| load | 49% |
| store | 28% |
| mul | 100% |
| add-sub | 58% |
| fma | 100% |
| div/sqrt | 53% |
| other | 34% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0x130(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x150(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM20,(%R12,%RCX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV 0x1b8(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x250(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| ADD %RAX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x248(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| INC %RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x128(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x118(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD 0x194(%RSP),%EAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x118(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| CMP 0x188(%RSP),%RCX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JE 405970 <main+0x26c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| LEA 0x1(%R8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVSS 0x8b15(%RIP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| CMP $0x8,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 405544 <main+0x2294> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RSI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x3,%RAX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| MOV %RDX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x8afb(%RIP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP %R9,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x78(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 40556f <main+0x22bf> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RDI,0x128(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,0x248(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,0x250(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R9,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x10,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VPBROADCASTQ %RDI,%ZMM17 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV %R8,0x150(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R9,0x100(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDI,0x2c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPBROADCASTQ %R9,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| JE 405740 <main+0x2490> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| DEC %RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x280(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPXORD %XMM20,%XMM20,%XMM20 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV $0x4069f0,%R12 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x128(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF64X4 $0x1,%ZMM20,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPS %ZMM0,%ZMM20,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VPCMPEQQ %ZMM17,%ZMM19,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (100.0%) |
| KMOVD %K0,%ECX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| MOV 0x2c0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| TEST $0x1,%CL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xa0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x78(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 40576a <main+0x24ba> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| TEST %RDI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x98(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 40580e <main+0x255e> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x118(%RSP),%EAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (%R12,%RAX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA -0x1(%RDI),%RSI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM20,%ZMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVSS 0x896b(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS %XMM20,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VBROADCASTSS %XMM0,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV $0x4069f0,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV 0x128(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %R8,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPCMPEQQ %ZMM17,%ZMM0,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (100.0%) |
| KMOVD %K0,%EAX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| TEST $0x1,%AL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x98(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0xa0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x78(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JE 405832 <main+0x2582> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP 0xe0(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 405480 <main+0x21d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| JMP 4058e5 <main+0x2635> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R8,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| MOV %RAX,0x280(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VPXORD %XMM20,%XMM20,%XMM20 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0xa0(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %ZMM0,%ZMM19,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPSUBQ %ZMM0,%ZMM19,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPCMPNLEUQ 0x8a79(%RIP),%ZMM0,%K0 | | | | | | | | | | | | vect (100.0%) |
| VPCMPNLEUQ 0x8aae(%RIP),%ZMM1,%K1 | | | | | | | | | | | | vect (100.0%) |
| KUNPCKBW %K0,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KMOVW %K1,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD 0x280(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| VMOVUPS (%RSI,%RAX,4),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| VMOVAPS %ZMM0,%ZMM21{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VSUBPS %ZMM18,%ZMM21,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| CALL 406aa0 <__svml_expf16_mask_z0> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| KMOVW 0x80(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVAPS %ZMM0,%ZMM0{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VADDPS %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VADDSS %XMM0,%XMM20,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| TEST %RDI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x98(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JNE 405676 <main+0x23c6> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x100(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VBROADCASTSS %XMM16,%ZMM18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VBROADCASTSS %XMM20,%ZMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %RAX,0x2c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R12,0x78(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x280(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %ECX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x2c0(%RSP),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VPBROADCASTQ %R12,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VPSUBQ %ZMM1,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPSUBQ %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | vect (100.0%) |
| VPCMPNLEUQ 0x899a(%RIP),%ZMM0,%K0 | | | | | | | | | | | | vect (100.0%) |
| VPCMPNLEUQ 0x89cf(%RIP),%ZMM2,%K1 | | | | | | | | | | | | vect (100.0%) |
| KUNPCKBW %K0,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 | N/A |
| KMOVW %K1,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | N/A |
| ADD %R12,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VMOVUPS (%RSI,%RCX,4),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| VMOVAPS %ZMM0,%ZMM22{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VSUBPS %ZMM18,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| CALL 406aa0 <__svml_expf16_mask_z0> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VRCP14PS %ZMM16,%ZMM1 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 7-8 | 2 | vect (100.0%) |
| VFMSUB213PS 0x40e00c{1to16},%ZMM1,%ZMM16 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VMULPS %ZMM0,%ZMM1,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VFNMADD213PS %ZMM0,%ZMM16,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| ADD %R12,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x78(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| KMOVW 0x80(%RSP),%K1 | 3 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 2 | 1 | N/A |
| VMOVUPS %ZMM0,(%RAX,%RDI,4){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (100.0%) |
| CMP 0xe0(%RSP),%R8 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 405480 <main+0x21d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x220(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x150(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD $0x4,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV $0x3fffffffc,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x230(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| SUB %RCX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD 0x78(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| MOV %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA 0x4(,%RAX,4),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVUPS %ZMM21,0x2c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (100.0%) |
| VMOVUPS %ZMM22,0x280(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (100.0%) |
| VMOVUPS %XMM20,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 406bf0 <_intel_fast_memset> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| VMOVUPS 0x80(%RSP),%XMM20 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| MOV 0x100(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVUPS 0x280(%RSP),%ZMM22 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| VMOVUPS 0x2c0(%RSP),%ZMM21 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (100.0%) |
| JMP 405480 <main+0x21d0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
