| Loop Id: 42 | Module: attention-avx512 | Source: attention.cpp:43-284 [...] | Coverage: 0.48% |
|---|
| Loop Id: 42 | Module: attention-avx512 | Source: attention.cpp:43-284 [...] | Coverage: 0.48% |
|---|
0x4860 MOV 0xb8(%RSP),%RAX |
0x4868 VMOVAPS 0x80(%RSP),%XMM0 |
0x4871 VMOVSS %XMM0,(%RAX,%R13,4) |
0x4877 MOV 0xc0(%RSP),%RDX |
0x487f INC %RDX |
0x4882 MOV 0x20(%RSP),%RCX |
0x4887 LEA (,%RCX,4),%RAX |
0x488f MOV 0x170(%RSP),%R8 |
0x4897 ADD %RAX,%R8 |
0x489a ADD %RAX,%RDI |
0x489d MOV 0xd8(%RSP),%RSI |
0x48a5 ADD %RAX,%RSI |
0x48a8 MOV %RSI,%RAX |
0x48ab MOV %R15,0x118(%RSP) |
0x48b3 CMP %RCX,%R15 |
0x48b6 MOV 0x50(%RSP),%R15 |
0x48bb MOV 0x48(%RSP),%R13 |
0x48c0 MOV 0x110(%RSP),%RBX |
0x48c8 JE 4810 |
0x48ce MOV %RAX,0xd8(%RSP) |
0x48d6 MOV %RDX,%RSI |
0x48d9 AND $-0x8,%RSI |
0x48dd CMP $0x8,%RDX |
0x48e1 MOV %RDX,%R15 |
0x48e4 JAE 4900 |
0x48e6 XOR %EAX,%EAX |
0x48e8 VMOVSS 0x1714(%RIP),%XMM1 |
0x48f0 JMP 4a20 |
0x4900 CMP $0x40,%R15 |
0x4904 JAE 4920 |
0x4906 XOR %EAX,%EAX |
0x4908 VMOVSS 0x16f4(%RIP),%XMM1 |
0x4910 JMP 49cb |
0x4920 MOV %R15,%RCX |
0x4923 AND $-0x40,%RCX |
0x4927 MOV $0x7ffffffffffffff8,%RAX |
0x4931 ADD $-0x38,%RAX |
0x4935 AND %R15,%RAX |
0x4938 XOR %EDX,%EDX |
0x493a VBROADCASTSS 0x16c0(%RIP),%ZMM3 |
0x4944 VMOVAPS %ZMM3,%ZMM0 |
0x494a VMOVAPS %ZMM3,%ZMM1 |
0x4950 VMOVAPS %ZMM3,%ZMM2 |
0x4956 NOPW %CS:(%RAX,%RAX,1) |
(37) 0x4960 VMAXPS -0xc0(%R8,%RDX,4),%ZMM0,%ZMM0 |
(37) 0x4968 VMAXPS -0x80(%R8,%RDX,4),%ZMM1,%ZMM1 |
(37) 0x4970 VMAXPS -0x40(%R8,%RDX,4),%ZMM2,%ZMM2 |
(37) 0x4978 VMAXPS (%R8,%RDX,4),%ZMM3,%ZMM3 |
(37) 0x497f ADD $0x40,%RDX |
(37) 0x4983 CMP %RDX,%RCX |
(37) 0x4986 JNE 4960 |
0x4988 VMAXPS %ZMM1,%ZMM0,%ZMM0 |
0x498e VMAXPS %ZMM3,%ZMM2,%ZMM1 |
0x4994 VMAXPS %ZMM1,%ZMM0,%ZMM0 |
0x499a VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 |
0x49a1 VMAXPS %YMM1,%YMM0,%YMM0 |
0x49a5 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x49ab VMAXPS %XMM1,%XMM0,%XMM0 |
0x49af VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x49b4 VMAXPS %XMM1,%XMM0,%XMM0 |
0x49b8 VMOVSHDUP %XMM0,%XMM1 |
0x49bc VMAXSS %XMM1,%XMM0,%XMM1 |
0x49c0 CMP %RAX,%R15 |
0x49c3 JE 4a2d |
0x49c5 TEST $0x38,%R15B |
0x49c9 JE 4a20 |
0x49cb MOV %RAX,%RCX |
0x49ce MOV %R15,%RAX |
0x49d1 MOV $0x7ffffffffffffff8,%RDX |
0x49db AND %RDX,%RAX |
0x49de VBROADCASTSS %XMM1,%YMM0 |
0x49e3 NOPW %CS:(%RAX,%RAX,1) |
(46) 0x49f0 VMAXPS (%RDI,%RCX,4),%YMM0,%YMM0 |
(46) 0x49f5 ADD $0x8,%RCX |
(46) 0x49f9 CMP %RCX,%RSI |
(46) 0x49fc JNE 49f0 |
0x49fe VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x4a04 VMAXPS %XMM1,%XMM0,%XMM0 |
0x4a08 VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x4a0d VMAXPS %XMM1,%XMM0,%XMM0 |
0x4a11 VMOVSHDUP %XMM0,%XMM1 |
0x4a15 VMAXSS %XMM1,%XMM0,%XMM1 |
0x4a19 JMP 4a28 |
(45) 0x4a20 VMAXSS (%RDI,%RAX,4),%XMM1,%XMM1 |
(45) 0x4a25 INC %RAX |
(45) 0x4a28 CMP %RAX,%R15 |
(45) 0x4a2b JNE 4a20 |
0x4a2d CMP $0x4,%R15 |
0x4a31 MOV %R15,0xc0(%RSP) |
0x4a39 MOV %RDI,0x10(%RSP) |
0x4a3e VMOVAPS %XMM1,0x30(%RSP) |
0x4a44 MOV %R8,0x170(%RSP) |
0x4a4c MOV %RSI,0x40(%RSP) |
0x4a51 JAE 4a60 |
0x4a53 VXORPS %XMM2,%XMM2,%XMM2 |
0x4a57 XOR %R13D,%R13D |
0x4a5a JMP 4cd0 |
0x4a60 CMP $0x20,%R15 |
0x4a64 JAE 4a80 |
0x4a66 VXORPS %XMM2,%XMM2,%XMM2 |
0x4a6a XOR %R13D,%R13D |
0x4a6d JMP 4c36 |
0x4a80 MOV %R15,%RAX |
0x4a83 AND $-0x20,%RAX |
0x4a87 MOV %RAX,0x178(%RSP) |
0x4a8f MOV $0x7ffffffffffffff8,%RAX |
0x4a99 LEA -0x18(%RAX),%R13 |
0x4a9d AND %R15,%R13 |
0x4aa0 VBROADCASTSS %XMM1,%YMM0 |
0x4aa5 VMOVAPS %YMM0,0x220(%RSP) |
0x4aae VXORPS %XMM1,%XMM1,%XMM1 |
0x4ab2 XOR %R15D,%R15D |
0x4ab5 VXORPS %XMM2,%XMM2,%XMM2 |
0x4ab9 VXORPS %XMM3,%XMM3,%XMM3 |
0x4abd VPXOR %XMM4,%XMM4,%XMM4 |
0x4ac1 NOPW %CS:(%RAX,%RAX,1) |
(38) 0x4ad0 VMOVDQA %YMM4,0x200(%RSP) |
(38) 0x4ad9 VMOVAPS %YMM3,0x120(%RSP) |
(38) 0x4ae2 VMOVAPS %YMM2,0xe0(%RSP) |
(38) 0x4aeb VMOVAPS %YMM1,0x80(%RSP) |
(38) 0x4af4 VMOVUPS (%RDI,%R15,4),%YMM0 |
(38) 0x4afa VMOVUPS 0x20(%RDI,%R15,4),%YMM1 |
(38) 0x4b01 VMOVUPS 0x40(%RDI,%R15,4),%YMM2 |
(38) 0x4b08 VMOVUPS 0x60(%RDI,%R15,4),%YMM3 |
(38) 0x4b0f VMOVAPS 0x220(%RSP),%YMM4 |
(38) 0x4b18 VSUBPS %YMM4,%YMM0,%YMM0 |
(38) 0x4b1c VSUBPS %YMM4,%YMM1,%YMM1 |
(38) 0x4b20 VMOVAPS %YMM1,0x1a0(%RSP) |
(38) 0x4b29 VSUBPS %YMM4,%YMM2,%YMM1 |
(38) 0x4b2d VMOVAPS %YMM1,0x180(%RSP) |
(38) 0x4b36 VSUBPS %YMM4,%YMM3,%YMM1 |
(38) 0x4b3a VMOVAPS %YMM1,0x1e0(%RSP) |
(38) 0x4b43 CALL 10d0 <_ZGVdN8v_expf@plt> |
(38) 0x4b48 VMOVAPS %YMM0,0x1c0(%RSP) |
(38) 0x4b51 VMOVAPS 0x1a0(%RSP),%YMM0 |
(38) 0x4b5a CALL 10d0 <_ZGVdN8v_expf@plt> |
(38) 0x4b5f VMOVAPS %YMM0,0x1a0(%RSP) |
(38) 0x4b68 VMOVAPS 0x180(%RSP),%YMM0 |
(38) 0x4b71 CALL 10d0 <_ZGVdN8v_expf@plt> |
(38) 0x4b76 VMOVAPS %YMM0,0x180(%RSP) |
(38) 0x4b7f VMOVAPS 0x1e0(%RSP),%YMM0 |
(38) 0x4b88 CALL 10d0 <_ZGVdN8v_expf@plt> |
(38) 0x4b8d VMOVAPS 0x200(%RSP),%YMM4 |
(38) 0x4b96 VMOVAPS 0x120(%RSP),%YMM3 |
(38) 0x4b9f VMOVAPS 0xe0(%RSP),%YMM2 |
(38) 0x4ba8 VMOVAPS 0x80(%RSP),%YMM1 |
(38) 0x4bb1 MOV 0x10(%RSP),%RDI |
(38) 0x4bb6 VADDPS 0x1c0(%RSP),%YMM1,%YMM1 |
(38) 0x4bbf VADDPS 0x1a0(%RSP),%YMM2,%YMM2 |
(38) 0x4bc8 VADDPS 0x180(%RSP),%YMM3,%YMM3 |
(38) 0x4bd1 VADDPS %YMM4,%YMM0,%YMM4 |
(38) 0x4bd5 ADD $0x20,%R15 |
(38) 0x4bd9 CMP %R15,0x178(%RSP) |
(38) 0x4be1 JNE 4ad0 |
0x4be7 VADDPS %YMM1,%YMM2,%YMM0 |
0x4beb VADDPS %YMM0,%YMM3,%YMM0 |
0x4bef VADDPS %YMM0,%YMM4,%YMM0 |
0x4bf3 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x4bf9 VADDPS %XMM1,%XMM0,%XMM0 |
0x4bfd VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
0x4c02 VADDPS %XMM1,%XMM0,%XMM0 |
0x4c06 VMOVSHDUP %XMM0,%XMM1 |
0x4c0a VADDSS %XMM1,%XMM0,%XMM2 |
0x4c0e MOV 0xc0(%RSP),%R15 |
0x4c16 CMP %R13,%R15 |
0x4c19 JNE 4c26 |
0x4c1b VMOVAPS 0x30(%RSP),%XMM1 |
0x4c21 JMP 4d0b |
0x4c26 TEST $0x1c,%R15B |
0x4c2a VMOVAPS 0x30(%RSP),%XMM1 |
0x4c30 JE 4cd0 |
0x4c36 MOV %R15,%RCX |
0x4c39 MOV %R13,%R15 |
0x4c3c VXORPS %XMM0,%XMM0,%XMM0 |
0x4c40 VBLENDPS $0x1,%XMM2,%XMM0,%XMM2 |
0x4c46 MOV %RCX,%RBX |
0x4c49 AND $-0x4,%RBX |
0x4c4d MOV $0x7ffffffffffffff8,%RAX |
0x4c57 LEA 0x4(%RAX),%R13 |
0x4c5b AND %RCX,%R13 |
0x4c5e VBROADCASTSS %XMM1,%XMM0 |
0x4c63 VMOVAPS %XMM0,0xe0(%RSP) |
0x4c6c NOPL (%RAX) |
(44) 0x4c70 VMOVAPS %XMM2,0x80(%RSP) |
(44) 0x4c79 VMOVUPS (%RDI,%R15,4),%XMM0 |
(44) 0x4c7f VSUBPS 0xe0(%RSP),%XMM0,%XMM0 |
(44) 0x4c88 VZEROUPPER |
(44) 0x4c8b CALL 1080 <_ZGVbN4v_expf@plt> |
(44) 0x4c90 VMOVAPS 0x80(%RSP),%XMM2 |
(44) 0x4c99 MOV 0x10(%RSP),%RDI |
(44) 0x4c9e VADDPS %XMM2,%XMM0,%XMM2 |
(44) 0x4ca2 ADD $0x4,%R15 |
(44) 0x4ca6 CMP %R15,%RBX |
(44) 0x4ca9 JNE 4c70 |
0x4cab VSHUFPD $0x1,%XMM2,%XMM2,%XMM0 |
0x4cb0 VADDPS %XMM0,%XMM2,%XMM0 |
0x4cb4 VMOVSHDUP %XMM0,%XMM1 |
0x4cb8 VADDSS %XMM1,%XMM0,%XMM2 |
0x4cbc MOV 0xc0(%RSP),%R15 |
0x4cc4 CMP %R13,%R15 |
0x4cc7 VMOVAPS 0x30(%RSP),%XMM1 |
0x4ccd JE 4d0b |
0x4ccf NOP |
(39) 0x4cd0 VMOVAPS %XMM2,0x80(%RSP) |
(39) 0x4cd9 VMOVSS (%RDI,%R13,4),%XMM0 |
(39) 0x4cdf VSUBSS %XMM1,%XMM0,%XMM0 |
(39) 0x4ce3 VZEROUPPER |
(39) 0x4ce6 CALL 1130 <expf@plt> |
(39) 0x4ceb VMOVAPS 0x80(%RSP),%XMM2 |
(39) 0x4cf4 VMOVAPS 0x30(%RSP),%XMM1 |
(39) 0x4cfa MOV 0x10(%RSP),%RDI |
(39) 0x4cff VADDSS %XMM2,%XMM0,%XMM2 |
(39) 0x4d03 INC %R13 |
(39) 0x4d06 CMP %R13,%R15 |
(39) 0x4d09 JNE 4cd0 |
0x4d0b CMP $0x8,%R15 |
0x4d0f VMOVAPS %XMM2,0x80(%RSP) |
0x4d18 JAE 4d30 |
0x4d1a MOV %R15,%R13 |
0x4d1d XOR %R15D,%R15D |
0x4d20 MOV 0xd8(%RSP),%RBX |
0x4d28 JMP 4db6 |
0x4d30 MOV $0x7ffffffffffffff8,%RAX |
0x4d3a AND %RAX,%R15 |
0x4d3d VBROADCASTSS %XMM1,%YMM0 |
0x4d42 VMOVAPS %YMM0,0xe0(%RSP) |
0x4d4b VMOVSS 0x12b5(%RIP),%XMM0 |
0x4d53 VDIVSS %XMM2,%XMM0,%XMM0 |
0x4d57 VBROADCASTSS %XMM0,%YMM0 |
0x4d5c VMOVAPS %YMM0,0x120(%RSP) |
0x4d65 XOR %R13D,%R13D |
0x4d68 MOV 0xd8(%RSP),%RBX |
(40) 0x4d70 VMOVUPS (%RDI,%R13,4),%YMM0 |
(40) 0x4d76 VSUBPS 0xe0(%RSP),%YMM0,%YMM0 |
(40) 0x4d7f CALL 10d0 <_ZGVdN8v_expf@plt> |
(40) 0x4d84 MOV 0x10(%RSP),%RDI |
(40) 0x4d89 VMULPS 0x120(%RSP),%YMM0,%YMM0 |
(40) 0x4d92 VMOVUPS %YMM0,(%RBX,%R13,4) |
(40) 0x4d98 ADD $0x8,%R13 |
(40) 0x4d9c CMP %R13,0x40(%RSP) |
(40) 0x4da1 JNE 4d70 |
0x4da3 MOV 0xc0(%RSP),%R13 |
0x4dab CMP %R15,%R13 |
0x4dae VMOVAPS 0x30(%RSP),%XMM1 |
0x4db4 JE 4e01 |
0x4db6 VMOVSS 0x124a(%RIP),%XMM0 |
0x4dbe VDIVSS 0x80(%RSP),%XMM0,%XMM0 |
0x4dc7 VMOVSS %XMM0,0x40(%RSP) |
0x4dcd NOPL (%RAX) |
(43) 0x4dd0 VMOVSS (%RDI,%R15,4),%XMM0 |
(43) 0x4dd6 VSUBSS %XMM1,%XMM0,%XMM0 |
(43) 0x4dda VZEROUPPER |
(43) 0x4ddd CALL 1130 <expf@plt> |
(43) 0x4de2 VMOVAPS 0x30(%RSP),%XMM1 |
(43) 0x4de8 MOV 0x10(%RSP),%RDI |
(43) 0x4ded VMULSS 0x40(%RSP),%XMM0,%XMM0 |
(43) 0x4df3 VMOVSS %XMM0,(%RBX,%R15,4) |
(43) 0x4df9 INC %R15 |
(43) 0x4dfc CMP %R15,%R13 |
(43) 0x4dff JNE 4dd0 |
0x4e01 MOV 0x118(%RSP),%R13 |
0x4e09 LEA 0x1(%R13),%R15 |
0x4e0d CMP 0x20(%RSP),%R15 |
0x4e12 JAE 4860 |
0x4e18 MOV 0x168(%RSP),%RDX |
0x4e20 SUB %R13D,%EDX |
0x4e23 SAL $0x2,%RDX |
0x4e27 MOV $0x3fffffffc,%RAX |
0x4e31 AND %RAX,%RDX |
0x4e34 ADD $0x4,%RDX |
0x4e38 MOV 0x160(%RSP),%RDI |
0x4e40 IMUL %R13,%RDI |
0x4e44 ADD $0x4,%RDI |
0x4e48 AND %RAX,%RDI |
0x4e4b ADD 0xb0(%RSP),%RDI |
0x4e53 XOR %ESI,%ESI |
0x4e55 VZEROUPPER |
0x4e58 CALL 1070 <memset@plt> |
0x4e5d MOV 0x10(%RSP),%RDI |
0x4e62 JMP 4860 |
/home/eoseret/Applications/llm-attention/attention.cpp: 43 - 284 |
-------------------------------------------------------------------------------- |
43: for (int row = 0; row < N; ++row) { |
44: const float *S_row = &S[row * N]; |
45: |
46: float max_val = -FLT_MAX; |
47: for (int idx = 0; idx <= row; ++idx) // vectorised |
48: if (S_row[idx] > max_val) max_val = S_row[idx]; |
49: |
50: float sum = 0.0f; |
51: #pragma clang loop vectorize(enable) |
52: for (int idx = 0; idx <= row; ++idx) // vectorised |
53: sum += expf(S_row[idx] - max_val); |
54: |
55: for (int idx = 0; idx <= row; ++idx) //vectorised |
56: P[row * N + idx] = expf(S_row[idx] - max_val) / sum; |
57: |
58: for (int idx = row + 1; idx < N; ++idx) |
59: P[row * N + idx] = 0.0f; |
60: |
61: D[row] = sum; |
[...] |
284: for (size_t r = 0; r < rept; r++) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-avx512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.09 |
| CQA speedup if FP arith vectorized | 2.12 |
| CQA speedup if fully vectorized | 1.65 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.79 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention.cpp:43-44,attention.cpp:47-47,attention.cpp:52-52,attention.cpp:55-55,attention.cpp:58-61,attention.cpp:284-284 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 47.00 |
| CQA cycles if no scalar integer | 22.50 |
| CQA cycles if FP arith vectorized | 22.21 |
| CQA cycles if fully vectorized | 28.56 |
| Front-end cycles | 47.00 |
| P0 cycles | 26.25 |
| P1 cycles | 26.25 |
| P2 cycles | 15.33 |
| P3 cycles | 15.33 |
| P4 cycles | 16.00 |
| P5 cycles | 26.25 |
| P6 cycles | 26.25 |
| P7 cycles | 15.33 |
| DIV/SQRT cycles | 6.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 184.00 |
| Nb uops | 188.00 |
| Nb loads | 30.00 |
| Nb stores | 15.00 |
| Nb stack references | 20.00 |
| FLOP/cycle | 0.85 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 2.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 9.87 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 256.00 |
| Bytes stored | 208.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 42.98 |
| Vectorization ratio load | 22.73 |
| Vectorization ratio store | 40.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 46.15 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 49.23 |
| Vector-efficiency ratio all | 21.88 |
| Vector-efficiency ratio load | 13.64 |
| Vector-efficiency ratio store | 21.67 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 22.60 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 24.42 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.09 |
| CQA speedup if FP arith vectorized | 2.12 |
| CQA speedup if fully vectorized | 1.65 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.79 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | attention.cpp:43-44,attention.cpp:47-47,attention.cpp:52-52,attention.cpp:55-55,attention.cpp:58-61,attention.cpp:284-284 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 47.00 |
| CQA cycles if no scalar integer | 22.50 |
| CQA cycles if FP arith vectorized | 22.21 |
| CQA cycles if fully vectorized | 28.56 |
| Front-end cycles | 47.00 |
| P0 cycles | 26.25 |
| P1 cycles | 26.25 |
| P2 cycles | 15.33 |
| P3 cycles | 15.33 |
| P4 cycles | 16.00 |
| P5 cycles | 26.25 |
| P6 cycles | 26.25 |
| P7 cycles | 15.33 |
| DIV/SQRT cycles | 6.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 184.00 |
| Nb uops | 188.00 |
| Nb loads | 30.00 |
| Nb stores | 15.00 |
| Nb stack references | 20.00 |
| FLOP/cycle | 0.85 |
| Nb FLOP add-sub | 38.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 2.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 9.87 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 256.00 |
| Bytes stored | 208.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 42.98 |
| Vectorization ratio load | 22.73 |
| Vectorization ratio store | 40.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 46.15 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 49.23 |
| Vector-efficiency ratio all | 21.88 |
| Vector-efficiency ratio load | 13.64 |
| Vector-efficiency ratio store | 21.67 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 22.60 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 24.42 |
| Path / |
| Function | main |
| Source file and lines | attention.cpp:43-284 |
| Module | attention-avx512 |
| nb instructions | 184 |
| nb uops | 188 |
| loop length | 934 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 5 |
| used ymm registers | 5 |
| used zmm registers | 4 |
| nb stack references | 20 |
| micro-operation queue | 47.00 cycles |
| front end | 47.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 26.25 | 26.25 | 15.33 | 15.33 | 16.00 | 26.25 | 26.25 | 15.33 |
| cycles | 26.25 | 26.25 | 15.33 | 15.33 | 16.00 | 26.25 | 26.25 | 15.33 |
| Cycles executing div or sqrt instructions | 6.00 |
| Front-end | 47.00 |
| Dispatch | 26.25 |
| DIV/SQRT | 6.00 |
| Overall L1 | 47.00 |
| Vectorization ratios: INT |
| all | 4% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 7% |
| Vectorization ratios: FP |
| all | 72% |
| load | 45% |
| store | 75% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 78% |
| Vectorization ratios: INT+FP |
| all | 42% |
| load | 22% |
| store | 40% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 46% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 49% |
| Vector-efficiency ratios: INT |
| all | 12% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 11% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| Vector-efficiency ratios: FP |
| all | 29% |
| load | 14% |
| store | 29% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 29% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 32% |
| Vector-efficiency ratios: INT+FP |
| all | 21% |
| load | 13% |
| store | 21% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 22% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 24% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0xb8(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVAPS 0x80(%RSP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| VMOVSS %XMM0,(%RAX,%R13,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV 0xc0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| INC %RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x20(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (,%RCX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV 0x170(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| ADD %RAX,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xd8(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| ADD %RAX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R15,0x118(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| CMP %RCX,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x50(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x48(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x110(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 4810 <main+0x2220> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RAX,0xd8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $-0x8,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP $0x8,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RDX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| JAE 4900 <main+0x2310> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x1714(%RIP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| JMP 4a20 <main+0x2430> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x40,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JAE 4920 <main+0x2330> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x16f4(%RIP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| JMP 49cb <main+0x23db> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV %R15,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x40,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD $-0x38,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R15,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VBROADCASTSS 0x16c0(%RIP),%ZMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 | scal (6.3%) |
| VMOVAPS %ZMM3,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVAPS %ZMM3,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVAPS %ZMM3,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMAXPS %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VMAXPS %ZMM3,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VMAXPS %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VMAXPS %YMM1,%YMM0,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VMAXSS %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| CMP %RAX,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 4a2d <main+0x243d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| TEST $0x38,%R15B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JE 4a20 <main+0x2430> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R15,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV $0x7ffffffffffffff8,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| AND %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM1,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VMAXSS %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| JMP 4a28 <main+0x2438> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x4,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %R15,0xc0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDI,0x10(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VMOVAPS %XMM1,0x30(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| MOV %R8,0x170(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| JAE 4a60 <main+0x2470> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4cd0 <main+0x26e0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x20,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JAE 4a80 <main+0x2490> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4c36 <main+0x2646> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV %R15,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x20,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,0x178(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| LEA -0x18(%RAX),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| AND %R15,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM1,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVAPS %YMM0,0x220(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VXORPS %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VADDPS %YMM1,%YMM2,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VADDPS %YMM0,%YMM3,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VADDPS %YMM0,%YMM4,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| MOV 0xc0(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %R13,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JNE 4c26 <main+0x2636> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JMP 4d0b <main+0x271b> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| TEST $0x1c,%R15B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JE 4cd0 <main+0x26e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %R15,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R13,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VBLENDPS $0x1,%XMM2,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $-0x4,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| LEA 0x4(%RAX),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| AND %RCX,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM1,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (6.3%) |
| VMOVAPS %XMM0,0xe0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VSHUFPD $0x1,%XMM2,%XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| MOV 0xc0(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %R13,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JE 4d0b <main+0x271b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP $0x8,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VMOVAPS %XMM2,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| JAE 4d30 <main+0x2740> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %R15,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| MOV 0xd8(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JMP 4db6 <main+0x27c6> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VBROADCASTSS %XMM1,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVAPS %YMM0,0xe0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VMOVSS 0x12b5(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS %XMM2,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VBROADCASTSS %XMM0,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVAPS %YMM0,0x120(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0xd8(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0xc0(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| CMP %R15,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JE 4e01 <main+0x2811> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVSS 0x124a(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS 0x80(%RSP),%XMM0,%XMM0 | 1 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VMOVSS %XMM0,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x118(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA 0x1(%R13),%R15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| CMP 0x20(%RSP),%R15 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 4860 <main+0x2270> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x168(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| SUB %R13D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| SAL $0x2,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV $0x3fffffffc,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD $0x4,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x160(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R13,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD $0x4,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD 0xb0(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 1070 <memset@plt> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x10(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JMP 4860 <main+0x2270> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | attention.cpp:43-284 |
| Module | attention-avx512 |
| nb instructions | 184 |
| nb uops | 188 |
| loop length | 934 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 5 |
| used ymm registers | 5 |
| used zmm registers | 4 |
| nb stack references | 20 |
| micro-operation queue | 47.00 cycles |
| front end | 47.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 26.25 | 26.25 | 15.33 | 15.33 | 16.00 | 26.25 | 26.25 | 15.33 |
| cycles | 26.25 | 26.25 | 15.33 | 15.33 | 16.00 | 26.25 | 26.25 | 15.33 |
| Cycles executing div or sqrt instructions | 6.00 |
| Front-end | 47.00 |
| Dispatch | 26.25 |
| DIV/SQRT | 6.00 |
| Overall L1 | 47.00 |
| all | 4% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 7% |
| all | 72% |
| load | 45% |
| store | 75% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 75% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 78% |
| all | 42% |
| load | 22% |
| store | 40% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 46% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 49% |
| all | 12% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 11% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 29% |
| load | 14% |
| store | 29% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 29% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 32% |
| all | 21% |
| load | 13% |
| store | 21% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 22% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 24% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV 0xb8(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVAPS 0x80(%RSP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| VMOVSS %XMM0,(%RAX,%R13,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOV 0xc0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| INC %RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x20(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA (,%RCX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV 0x170(%RSP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| ADD %RAX,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xd8(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| ADD %RAX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R15,0x118(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| CMP %RCX,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x50(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0x48(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0x110(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JE 4810 <main+0x2220> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RAX,0xd8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $-0x8,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP $0x8,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RDX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| JAE 4900 <main+0x2310> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x1714(%RIP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| JMP 4a20 <main+0x2430> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x40,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JAE 4920 <main+0x2330> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMOVSS 0x16f4(%RIP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| JMP 49cb <main+0x23db> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV %R15,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x40,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD $-0x38,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R15,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VBROADCASTSS 0x16c0(%RIP),%ZMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 | scal (6.3%) |
| VMOVAPS %ZMM3,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVAPS %ZMM3,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVAPS %ZMM3,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VMAXPS %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VMAXPS %ZMM3,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VMAXPS %ZMM1,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 | vect (100.0%) |
| VEXTRACTF64X4 $0x1,%ZMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (50.0%) |
| VMAXPS %YMM1,%YMM0,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VMAXSS %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| CMP %RAX,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JE 4a2d <main+0x243d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| TEST $0x38,%R15B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JE 4a20 <main+0x2430> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R15,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV $0x7ffffffffffffff8,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| AND %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM1,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VMAXPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VMAXSS %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| JMP 4a28 <main+0x2438> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x4,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %R15,0xc0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDI,0x10(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VMOVAPS %XMM1,0x30(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| MOV %R8,0x170(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| JAE 4a60 <main+0x2470> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4cd0 <main+0x26e0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| CMP $0x20,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JAE 4a80 <main+0x2490> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 4c36 <main+0x2646> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV %R15,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $-0x20,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,0x178(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| LEA -0x18(%RAX),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| AND %R15,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM1,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVAPS %YMM0,0x220(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VXORPS %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VXORPS %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VADDPS %YMM1,%YMM2,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VADDPS %YMM0,%YMM3,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VADDPS %YMM0,%YMM4,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (50.0%) |
| VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| MOV 0xc0(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %R13,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JNE 4c26 <main+0x2636> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JMP 4d0b <main+0x271b> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| TEST $0x1c,%R15B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JE 4cd0 <main+0x26e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %R15,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R13,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| VBLENDPS $0x1,%XMM2,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $-0x4,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| LEA 0x4(%RAX),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| AND %RCX,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VBROADCASTSS %XMM1,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (6.3%) |
| VMOVAPS %XMM0,0xe0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VSHUFPD $0x1,%XMM2,%XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
| VADDPS %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
| VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (12.5%) |
| VADDSS %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| MOV 0xc0(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| CMP %R13,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JE 4d0b <main+0x271b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| CMP $0x8,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VMOVAPS %XMM2,0x80(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
| JAE 4d30 <main+0x2740> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %R15,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| MOV 0xd8(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| JMP 4db6 <main+0x27c6> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV $0x7ffffffffffffff8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%R15 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VBROADCASTSS %XMM1,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVAPS %YMM0,0xe0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| VMOVSS 0x12b5(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS %XMM2,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VBROADCASTSS %XMM0,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (6.3%) |
| VMOVAPS %YMM0,0x120(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (50.0%) |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0xd8(%RSP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV 0xc0(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| CMP %R15,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VMOVAPS 0x30(%RSP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
| JE 4e01 <main+0x2811> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVSS 0x124a(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| VDIVSS 0x80(%RSP),%XMM0,%XMM0 | 1 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 11 | 3 | scal (6.3%) |
| VMOVSS %XMM0,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x118(%RSP),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| LEA 0x1(%R13),%R15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| CMP 0x20(%RSP),%R15 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| JAE 4860 <main+0x2270> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV 0x168(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| SUB %R13D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| SAL $0x2,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV $0x3fffffffc,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD $0x4,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x160(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| IMUL %R13,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | N/A |
| ADD $0x4,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD 0xb0(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 1070 <memset@plt> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| MOV 0x10(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| JMP 4860 <main+0x2270> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
