| Loop Id: 77 | Module: attention-clang-skl256 | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.93% |
|---|
| Loop Id: 77 | Module: attention-clang-skl256 | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.93% |
|---|
0x2ea0 MOV %RCX,%RDX |
0x2ea3 INC %RCX |
0x2ea6 MOV %RCX,0x1730(%RSP) |
0x2eae MOV 0x3b0(%RSP,%RDX,8),%RDX |
0x2eb6 MOV %RDX,%RSI |
0x2eb9 SHR $0xb,%RSI |
0x2ebd MOV %ESI,%ESI |
0x2ebf XOR %RDX,%RSI |
0x2ec2 MOV %ESI,%EDX |
0x2ec4 SAL $0x7,%EDX |
0x2ec7 AND $-0x62d3a980,%EDX |
0x2ecd XOR %RSI,%RDX |
0x2ed0 MOV %EDX,%ESI |
0x2ed2 SAL $0xf,%ESI |
0x2ed5 AND $-0x103a0000,%ESI |
0x2edb XOR %RDX,%RSI |
0x2ede MOV %RSI,%RDX |
0x2ee1 SHR $0x12,%RDX |
0x2ee5 XOR %ESI,%EDX |
0x2ee7 VCVTUSI2SS %EDX,%XMM15,%XMM0 |
0x2eed VMULSS 0x3113(%RIP),%XMM0,%XMM0 |
0x2ef5 VUCOMISS 0x310f(%RIP),%XMM0 |
0x2efd JB 3265 |
0x2f03 CMP $0x270,%RCX |
0x2f0a JB 2ea0 |
0x2f0c VPBROADCASTQ 0x3b0(%RSP),%YMM0 |
0x2f16 XOR %ECX,%ECX |
0x2f18 VPBROADCASTQ 0x3117(%RIP),%YMM12 |
0x2f21 VPBROADCASTQ 0x3116(%RIP),%YMM13 |
0x2f2a VPBROADCASTQ 0x3115(%RIP),%YMM14 |
0x2f33 VPBROADCASTQ 0x3114(%RIP),%YMM15 |
0x2f3c NOPL (%RAX) |
(78) 0x2f40 VMOVDQA %YMM0,%YMM1 |
(78) 0x2f44 VMOVDQU 0x3b8(%RSP,%RCX,8),%YMM2 |
(78) 0x2f4d VMOVDQU 0x3d8(%RSP,%RCX,8),%YMM3 |
(78) 0x2f56 VMOVDQU 0x3f8(%RSP,%RCX,8),%YMM4 |
(78) 0x2f5f VMOVDQU 0x418(%RSP,%RCX,8),%YMM0 |
(78) 0x2f68 VALIGNQ $0x3,%YMM1,%YMM2,%YMM1 |
(78) 0x2f6f VALIGNQ $0x3,%YMM2,%YMM3,%YMM5 |
(78) 0x2f76 VALIGNQ $0x3,%YMM3,%YMM4,%YMM6 |
(78) 0x2f7d VALIGNQ $0x3,%YMM4,%YMM0,%YMM7 |
(78) 0x2f84 VPAND %YMM2,%YMM13,%YMM8 |
(78) 0x2f88 VPAND %YMM3,%YMM13,%YMM9 |
(78) 0x2f8c VPAND %YMM4,%YMM13,%YMM10 |
(78) 0x2f90 VPAND %YMM0,%YMM13,%YMM11 |
(78) 0x2f94 VPTERNLOGQ $-0x8,%YMM12,%YMM1,%YMM8 |
(78) 0x2f9b VPTERNLOGQ $-0x8,%YMM12,%YMM5,%YMM9 |
(78) 0x2fa2 VPTERNLOGQ $-0x8,%YMM12,%YMM6,%YMM10 |
(78) 0x2fa9 VPTERNLOGQ $-0x8,%YMM12,%YMM7,%YMM11 |
(78) 0x2fb0 VPSRLQ $0x1,%YMM8,%YMM1 |
(78) 0x2fb6 VPSRLQ $0x1,%YMM9,%YMM5 |
(78) 0x2fbc VPSRLQ $0x1,%YMM10,%YMM6 |
(78) 0x2fc2 VPSRLQ $0x1,%YMM11,%YMM7 |
(78) 0x2fc8 VPXOR 0x1018(%RSP,%RCX,8),%YMM1,%YMM1 |
(78) 0x2fd1 VPXOR 0x1038(%RSP,%RCX,8),%YMM5,%YMM5 |
(78) 0x2fda VPXOR 0x1058(%RSP,%RCX,8),%YMM6,%YMM6 |
(78) 0x2fe3 VPXOR 0x1078(%RSP,%RCX,8),%YMM7,%YMM7 |
(78) 0x2fec VPTESTMQ %YMM14,%YMM2,%K1 |
(78) 0x2ff2 VPTESTMQ %YMM14,%YMM3,%K2 |
(78) 0x2ff8 VPTESTMQ %YMM14,%YMM4,%K3 |
(78) 0x2ffe VPTESTMQ %YMM14,%YMM0,%K4 |
(78) 0x3004 VPXORQ %YMM15,%YMM1,%YMM1{%K1} |
(78) 0x300a VPXORQ %YMM15,%YMM5,%YMM5{%K2} |
(78) 0x3010 VPXORQ %YMM15,%YMM6,%YMM6{%K3} |
(78) 0x3016 VPXORQ %YMM15,%YMM7,%YMM7{%K4} |
(78) 0x301c VMOVDQU %YMM1,0x3b0(%RSP,%RCX,8) |
(78) 0x3025 VMOVDQU %YMM5,0x3d0(%RSP,%RCX,8) |
(78) 0x302e VMOVDQU %YMM6,0x3f0(%RSP,%RCX,8) |
(78) 0x3037 VMOVDQU %YMM7,0x410(%RSP,%RCX,8) |
(78) 0x3040 ADD $0x10,%RCX |
(78) 0x3044 CMP $0xe0,%RCX |
(78) 0x304b JNE 2f40 |
0x3051 VEXTRACTI128 $0x1,%YMM0,%XMM0 |
0x3057 VPEXTRQ $0x1,%XMM0,%RSI |
0x305d AND $-0x80000000,%RSI |
0x3064 MOV 0xab8(%RSP),%RDX |
0x306c MOV 0xac0(%RSP),%RCX |
0x3074 MOV %EDX,%EDI |
0x3076 AND $0x7ffffffe,%EDI |
0x307c OR %RSI,%RDI |
0x307f SHR $0x1,%RDI |
0x3082 XOR 0x1718(%RSP),%RDI |
0x308a MOV %EDX,%ESI |
0x308c AND $0x1,%ESI |
0x308f NEG %ESI |
0x3091 MOV $-0x66f74f21,%R8D |
0x3097 AND %R8D,%ESI |
0x309a XOR %RDI,%RSI |
0x309d MOV %RSI,0xab0(%RSP) |
0x30a5 AND $-0x80000000,%RDX |
0x30ac MOV %ECX,%ESI |
0x30ae AND $0x7ffffffe,%ESI |
0x30b4 OR %RDX,%RSI |
0x30b7 SHR $0x1,%RSI |
0x30ba XOR 0x1720(%RSP),%RSI |
0x30c2 MOV %ECX,%EDX |
0x30c4 AND $0x1,%EDX |
0x30c7 NEG %EDX |
0x30c9 AND %R8D,%EDX |
0x30cc XOR %RSI,%RDX |
0x30cf MOV %RDX,0xab8(%RSP) |
0x30d7 AND $-0x80000000,%RCX |
0x30de MOV 0xac8(%RSP),%RDX |
0x30e6 MOV %EDX,%ESI |
0x30e8 VPBROADCASTQ %RDX,%XMM0 |
0x30ee AND $0x7ffffffe,%EDX |
0x30f4 OR %RCX,%RDX |
0x30f7 SHR $0x1,%RDX |
0x30fa XOR 0x1728(%RSP),%RDX |
0x3102 AND $0x1,%ESI |
0x3105 NEG %ESI |
0x3107 MOV $-0x66f74f21,%EDI |
0x310c AND %R8D,%ESI |
0x310f XOR %RDX,%RSI |
0x3112 MOV %RSI,0xac0(%RSP) |
0x311a MOV $0xe8,%ECX |
0x311f VPBROADCASTQ 0x2f10(%RIP),%XMM5 |
0x3128 VPBROADCASTQ 0x2f0f(%RIP),%XMM6 |
0x3131 VPBROADCASTQ 0x2f0e(%RIP),%XMM7 |
0x313a VPBROADCASTQ 0x2f0d(%RIP),%XMM8 |
0x3143 NOPW %CS:(%RAX,%RAX,1) |
(79) 0x3150 VMOVDQU 0x390(%RSP,%RCX,8),%XMM1 |
(79) 0x3159 VMOVDQU 0x3a0(%RSP,%RCX,8),%XMM2 |
(79) 0x3162 VPALIGNR $0x8,%XMM0,%XMM1,%XMM0 |
(79) 0x3168 VMOVDQU 0x3b0(%RSP,%RCX,8),%XMM3 |
(79) 0x3171 VPAND %XMM6,%XMM1,%XMM4 |
(79) 0x3175 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM4 |
(79) 0x317c VPSRLQ $0x1,%XMM4,%XMM0 |
(79) 0x3181 VPXOR -0x390(%RSP,%RCX,8),%XMM0,%XMM0 |
(79) 0x318a VPTESTMQ %XMM7,%XMM1,%K1 |
(79) 0x3190 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(79) 0x3196 VMOVDQU %XMM0,0x388(%RSP,%RCX,8) |
(79) 0x319f VPALIGNR $0x8,%XMM1,%XMM2,%XMM0 |
(79) 0x31a5 VPAND %XMM6,%XMM2,%XMM1 |
(79) 0x31a9 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(79) 0x31b0 VPSRLQ $0x1,%XMM1,%XMM0 |
(79) 0x31b5 VPXOR -0x380(%RSP,%RCX,8),%XMM0,%XMM0 |
(79) 0x31be VPTESTMQ %XMM7,%XMM2,%K1 |
(79) 0x31c4 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(79) 0x31ca VMOVDQU %XMM0,0x398(%RSP,%RCX,8) |
(79) 0x31d3 VPALIGNR $0x8,%XMM2,%XMM3,%XMM0 |
(79) 0x31d9 VPAND %XMM6,%XMM3,%XMM1 |
(79) 0x31dd VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(79) 0x31e4 VPSRLQ $0x1,%XMM1,%XMM0 |
(79) 0x31e9 VPXOR -0x370(%RSP,%RCX,8),%XMM0,%XMM0 |
(79) 0x31f2 VPTESTMQ %XMM7,%XMM3,%K1 |
(79) 0x31f8 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(79) 0x31fe VMOVDQU %XMM0,0x3a8(%RSP,%RCX,8) |
(79) 0x3207 ADD $0x6,%RCX |
(79) 0x320b VMOVDQA %XMM3,%XMM0 |
(79) 0x320f CMP $0x274,%RCX |
(79) 0x3216 JNE 3150 |
0x321c MOV 0x1728(%RSP),%RCX |
0x3224 MOV $-0x80000000,%RDX |
0x322b AND %RDX,%RCX |
0x322e MOV 0x3b0(%RSP),%RDX |
0x3236 MOV %EDX,%ESI |
0x3238 AND $0x7ffffffe,%ESI |
0x323e OR %RCX,%RSI |
0x3241 SHR $0x1,%RSI |
0x3244 XOR 0x1010(%RSP),%RSI |
0x324c AND $0x1,%EDX |
0x324f NEG %EDX |
0x3251 AND %EDI,%EDX |
0x3253 XOR %RSI,%RDX |
0x3256 MOV %RDX,0x1728(%RSP) |
0x325e XOR %ECX,%ECX |
0x3260 JMP 2ea0 |
0x3265 VMOVSS %XMM0,(%R14,%RAX,4) |
0x326b INC %RAX |
0x326e CMP 0x60(%RSP),%RAX |
0x3273 JNE 2f03 |
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/16.1.1/../../../../include/c++/16.1.1/bits/random.tcc: 404 - 3558 |
-------------------------------------------------------------------------------- |
404: for (size_t __k = 0; __k < (__n - __m); ++__k) |
405: { |
406: _UIntType __y = ((_M_x[__k] & __upper_mask) |
407: | (_M_x[__k + 1] & __lower_mask)); |
408: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
409: ^ ((__y & 0x01) ? __a : 0)); |
410: } |
411: |
412: for (size_t __k = (__n - __m); __k < (__n - 1); ++__k) |
413: { |
414: _UIntType __y = ((_M_x[__k] & __upper_mask) |
415: | (_M_x[__k + 1] & __lower_mask)); |
416: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
417: ^ ((__y & 0x01) ? __a : 0)); |
418: } |
419: |
420: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
421: | (_M_x[0] & __lower_mask)); |
422: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
423: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
458: if (_M_p >= state_size) |
459: _M_gen_rand(); |
460: |
461: // Calculate o(x(i)). |
462: result_type __z = _M_x[_M_p++]; |
463: __z ^= (__z >> __u) & __d; |
464: __z ^= (__z << __s) & __b; |
465: __z ^= (__z << __t) & __c; |
466: __z ^= (__z >> __l); |
[...] |
3557: const _RealT __ret = _RealT(__sum >> __log2_x) / _RealT(__rd); |
3558: if (__ret < _RealT(1.0)) |
/home/eoseret/Applications/llm-attention/attention_v2.cpp: 163 - 163 |
-------------------------------------------------------------------------------- |
163: for (size_t i = 0; i < elemsX; ++i) h_X[i] = dist(rng); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-clang-skl256 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.91 |
| CQA speedup if FP arith vectorized | 2.08 |
| CQA speedup if fully vectorized | 12.24 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.56 |
| Bottlenecks | |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 16.63 |
| CQA cycles if no scalar integer | 4.25 |
| CQA cycles if FP arith vectorized | 7.99 |
| CQA cycles if fully vectorized | 1.36 |
| Front-end cycles | 16.63 |
| P0 cycles | 10.63 |
| P1 cycles | 10.63 |
| P2 cycles | 6.50 |
| P3 cycles | 6.50 |
| P4 cycles | 4.00 |
| P5 cycles | 10.63 |
| P6 cycles | 10.63 |
| P7 cycles | 4.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 65.00 |
| Nb uops | 66.50 |
| Nb loads | 13.00 |
| Nb stores | 4.00 |
| Nb stack references | 6.50 |
| FLOP/cycle | 0.06 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 6.52 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 96.00 |
| Bytes stored | 28.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.76 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.89 |
| Vector-efficiency ratio all | 9.65 |
| Vector-efficiency ratio load | 8.98 |
| Vector-efficiency ratio store | 10.42 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.51 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 4.29 |
| CQA speedup if FP arith vectorized | 1.95 |
| CQA speedup if fully vectorized | 11.80 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.58 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 25.75 |
| CQA cycles if no scalar integer | 6.00 |
| CQA cycles if FP arith vectorized | 13.20 |
| CQA cycles if fully vectorized | 2.18 |
| Front-end cycles | 25.75 |
| P0 cycles | 16.25 |
| P1 cycles | 16.25 |
| P2 cycles | 11.00 |
| P3 cycles | 11.00 |
| P4 cycles | 6.00 |
| P5 cycles | 16.25 |
| P6 cycles | 16.25 |
| P7 cycles | 6.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 101.00 |
| Nb uops | 103.00 |
| Nb loads | 22.00 |
| Nb stores | 6.00 |
| Nb stack references | 11.00 |
| FLOP/cycle | 0.04 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.23 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 168.00 |
| Bytes stored | 44.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 1.52 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 1.79 |
| Vector-efficiency ratio all | 10.13 |
| Vector-efficiency ratio load | 11.72 |
| Vector-efficiency ratio store | 11.46 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.93 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.00 |
| CQA speedup if FP arith vectorized | 2.71 |
| CQA speedup if fully vectorized | 14.06 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.50 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.50 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 2.77 |
| CQA cycles if fully vectorized | 0.53 |
| Front-end cycles | 7.50 |
| P0 cycles | 5.00 |
| P1 cycles | 5.00 |
| P2 cycles | 2.00 |
| P3 cycles | 2.00 |
| P4 cycles | 2.00 |
| P5 cycles | 5.00 |
| P6 cycles | 5.00 |
| P7 cycles | 2.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 29.00 |
| Nb uops | 30.00 |
| Nb loads | 4.00 |
| Nb stores | 2.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 0.13 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 4.80 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 24.00 |
| Bytes stored | 12.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 9.17 |
| Vector-efficiency ratio load | 6.25 |
| Vector-efficiency ratio store | 9.38 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.09 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-clang-skl256 |
| nb instructions | 65 |
| nb uops | 66.50 |
| loop length | 318 |
| used x86 registers | 7 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 2.50 |
| used zmm registers | 0 |
| nb stack references | 6.50 |
| micro-operation queue | 16.63 cycles |
| front end | 16.63 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 10.63 | 10.63 | 6.50 | 6.50 | 4.00 | 10.63 | 10.63 | 4.00 |
| cycles | 10.63 | 10.63 | 6.50 | 6.50 | 4.00 | 10.63 | 10.63 | 4.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 16.63 |
| Dispatch | 10.63 |
| Data deps. | 1.00 |
| Overall L1 | 16.63 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 6% |
| load | 6% |
| store | 6% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 9% |
| load | 8% |
| store | 10% |
| mul | 6% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-clang-skl256 |
| nb instructions | 101 |
| nb uops | 103 |
| loop length | 508 |
| used x86 registers | 8 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 5 |
| used zmm registers | 0 |
| nb stack references | 11 |
| micro-operation queue | 25.75 cycles |
| front end | 25.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 16.25 | 16.25 | 11.00 | 11.00 | 6.00 | 16.25 | 16.25 | 6.00 |
| cycles | 16.25 | 16.25 | 11.00 | 11.00 | 6.00 | 16.25 | 16.25 | 6.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 25.75 |
| Dispatch | 16.25 |
| Data deps. | 1.00 |
| Overall L1 | 25.75 |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 1% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 1% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 6% |
| load | 6% |
| store | 6% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 10% |
| load | 11% |
| store | 11% |
| mul | 6% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RCX,0x1730(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3b0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x3113(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x310f(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3265 <main+0xd85> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 2ea0 <main+0x9c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPBROADCASTQ 0x3b0(%RSP),%YMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ 0x3117(%RIP),%YMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x3116(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x3115(%RIP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x3114(%RIP),%YMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTI128 $0x1,%YMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VPEXTRQ $0x1,%XMM0,%RSI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xab8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xac0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1718(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%R8D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDI,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xab0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1720(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R8D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0xab8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xac8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RDX,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| OR %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR 0x1728(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xac0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0xe8,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPBROADCASTQ 0x2f10(%RIP),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2f0f(%RIP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2f0e(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2f0d(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1728(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x3b0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1010(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %EDI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0x1728(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| JMP 2ea0 <main+0x9c0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VMOVSS %XMM0,(%R14,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP 0x60(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JNE 2f03 <main+0xa23> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-clang-skl256 |
| nb instructions | 29 |
| nb uops | 30 |
| loop length | 128 |
| used x86 registers | 6 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 2 |
| micro-operation queue | 7.50 cycles |
| front end | 7.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 5.00 | 5.00 | 2.00 | 2.00 | 2.00 | 5.00 | 5.00 | 2.00 |
| cycles | 5.00 | 5.00 | 2.00 | 2.00 | 2.00 | 5.00 | 5.00 | 2.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 7.50 |
| Dispatch | 5.00 |
| Data deps. | 1.00 |
| Overall L1 | 7.50 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 9% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 6% |
| load | 6% |
| store | 6% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 9% |
| load | 6% |
| store | 9% |
| mul | 6% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RCX,0x1730(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3b0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x3113(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x310f(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3265 <main+0xd85> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 2ea0 <main+0x9c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VMOVSS %XMM0,(%R14,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP 0x60(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JNE 2f03 <main+0xa23> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
