| Loop Id: 69 | Module: attention-clang-skl256 | Source: random.tcc:404-3558 [...] | Coverage: 0.02% |
|---|
| Loop Id: 69 | Module: attention-clang-skl256 | Source: random.tcc:404-3558 [...] | Coverage: 0.02% |
|---|
0x3a60 MOV %RCX,%RDX |
0x3a63 INC %RCX |
0x3a66 MOV %RCX,0x1730(%RSP) |
0x3a6e MOV 0x3b0(%RSP,%RDX,8),%RDX |
0x3a76 MOV %RDX,%RSI |
0x3a79 SHR $0xb,%RSI |
0x3a7d MOV %ESI,%ESI |
0x3a7f XOR %RDX,%RSI |
0x3a82 MOV %ESI,%EDX |
0x3a84 SAL $0x7,%EDX |
0x3a87 AND $-0x62d3a980,%EDX |
0x3a8d XOR %RSI,%RDX |
0x3a90 MOV %EDX,%ESI |
0x3a92 SAL $0xf,%ESI |
0x3a95 AND $-0x103a0000,%ESI |
0x3a9b XOR %RDX,%RSI |
0x3a9e MOV %RSI,%RDX |
0x3aa1 SHR $0x12,%RDX |
0x3aa5 XOR %ESI,%EDX |
0x3aa7 VCVTUSI2SS %EDX,%XMM15,%XMM0 |
0x3aad VMULSS 0x2553(%RIP),%XMM0,%XMM0 |
0x3ab5 VUCOMISS 0x254f(%RIP),%XMM0 |
0x3abd JB 3e25 |
0x3ac3 CMP $0x270,%RCX |
0x3aca JB 3a60 |
0x3acc VPBROADCASTQ 0x3b0(%RSP),%YMM0 |
0x3ad6 XOR %ECX,%ECX |
0x3ad8 VPBROADCASTQ 0x2557(%RIP),%YMM12 |
0x3ae1 VPBROADCASTQ 0x2556(%RIP),%YMM13 |
0x3aea VPBROADCASTQ 0x2555(%RIP),%YMM14 |
0x3af3 VPBROADCASTQ 0x2554(%RIP),%YMM15 |
0x3afc NOPL (%RAX) |
(70) 0x3b00 VMOVDQA %YMM0,%YMM1 |
(70) 0x3b04 VMOVDQU 0x3b8(%RSP,%RCX,8),%YMM2 |
(70) 0x3b0d VMOVDQU 0x3d8(%RSP,%RCX,8),%YMM3 |
(70) 0x3b16 VMOVDQU 0x3f8(%RSP,%RCX,8),%YMM4 |
(70) 0x3b1f VMOVDQU 0x418(%RSP,%RCX,8),%YMM0 |
(70) 0x3b28 VALIGNQ $0x3,%YMM1,%YMM2,%YMM1 |
(70) 0x3b2f VALIGNQ $0x3,%YMM2,%YMM3,%YMM5 |
(70) 0x3b36 VALIGNQ $0x3,%YMM3,%YMM4,%YMM6 |
(70) 0x3b3d VALIGNQ $0x3,%YMM4,%YMM0,%YMM7 |
(70) 0x3b44 VPAND %YMM2,%YMM13,%YMM8 |
(70) 0x3b48 VPAND %YMM3,%YMM13,%YMM9 |
(70) 0x3b4c VPAND %YMM4,%YMM13,%YMM10 |
(70) 0x3b50 VPAND %YMM0,%YMM13,%YMM11 |
(70) 0x3b54 VPTERNLOGQ $-0x8,%YMM12,%YMM1,%YMM8 |
(70) 0x3b5b VPTERNLOGQ $-0x8,%YMM12,%YMM5,%YMM9 |
(70) 0x3b62 VPTERNLOGQ $-0x8,%YMM12,%YMM6,%YMM10 |
(70) 0x3b69 VPTERNLOGQ $-0x8,%YMM12,%YMM7,%YMM11 |
(70) 0x3b70 VPSRLQ $0x1,%YMM8,%YMM1 |
(70) 0x3b76 VPSRLQ $0x1,%YMM9,%YMM5 |
(70) 0x3b7c VPSRLQ $0x1,%YMM10,%YMM6 |
(70) 0x3b82 VPSRLQ $0x1,%YMM11,%YMM7 |
(70) 0x3b88 VPXOR 0x1018(%RSP,%RCX,8),%YMM1,%YMM1 |
(70) 0x3b91 VPXOR 0x1038(%RSP,%RCX,8),%YMM5,%YMM5 |
(70) 0x3b9a VPXOR 0x1058(%RSP,%RCX,8),%YMM6,%YMM6 |
(70) 0x3ba3 VPXOR 0x1078(%RSP,%RCX,8),%YMM7,%YMM7 |
(70) 0x3bac VPTESTMQ %YMM14,%YMM2,%K1 |
(70) 0x3bb2 VPTESTMQ %YMM14,%YMM3,%K2 |
(70) 0x3bb8 VPTESTMQ %YMM14,%YMM4,%K3 |
(70) 0x3bbe VPTESTMQ %YMM14,%YMM0,%K4 |
(70) 0x3bc4 VPXORQ %YMM15,%YMM1,%YMM1{%K1} |
(70) 0x3bca VPXORQ %YMM15,%YMM5,%YMM5{%K2} |
(70) 0x3bd0 VPXORQ %YMM15,%YMM6,%YMM6{%K3} |
(70) 0x3bd6 VPXORQ %YMM15,%YMM7,%YMM7{%K4} |
(70) 0x3bdc VMOVDQU %YMM1,0x3b0(%RSP,%RCX,8) |
(70) 0x3be5 VMOVDQU %YMM5,0x3d0(%RSP,%RCX,8) |
(70) 0x3bee VMOVDQU %YMM6,0x3f0(%RSP,%RCX,8) |
(70) 0x3bf7 VMOVDQU %YMM7,0x410(%RSP,%RCX,8) |
(70) 0x3c00 ADD $0x10,%RCX |
(70) 0x3c04 CMP $0xe0,%RCX |
(70) 0x3c0b JNE 3b00 |
0x3c11 VEXTRACTI128 $0x1,%YMM0,%XMM0 |
0x3c17 VPEXTRQ $0x1,%XMM0,%RSI |
0x3c1d AND $-0x80000000,%RSI |
0x3c24 MOV 0xab8(%RSP),%RDX |
0x3c2c MOV 0xac0(%RSP),%RCX |
0x3c34 MOV %EDX,%EDI |
0x3c36 AND $0x7ffffffe,%EDI |
0x3c3c OR %RSI,%RDI |
0x3c3f SHR $0x1,%RDI |
0x3c42 XOR 0x1718(%RSP),%RDI |
0x3c4a MOV %EDX,%ESI |
0x3c4c AND $0x1,%ESI |
0x3c4f NEG %ESI |
0x3c51 MOV $-0x66f74f21,%R8D |
0x3c57 AND %R8D,%ESI |
0x3c5a XOR %RDI,%RSI |
0x3c5d MOV %RSI,0xab0(%RSP) |
0x3c65 AND $-0x80000000,%RDX |
0x3c6c MOV %ECX,%ESI |
0x3c6e AND $0x7ffffffe,%ESI |
0x3c74 OR %RDX,%RSI |
0x3c77 SHR $0x1,%RSI |
0x3c7a XOR 0x1720(%RSP),%RSI |
0x3c82 MOV %ECX,%EDX |
0x3c84 AND $0x1,%EDX |
0x3c87 NEG %EDX |
0x3c89 AND %R8D,%EDX |
0x3c8c XOR %RSI,%RDX |
0x3c8f MOV %RDX,0xab8(%RSP) |
0x3c97 AND $-0x80000000,%RCX |
0x3c9e MOV 0xac8(%RSP),%RDX |
0x3ca6 MOV %EDX,%ESI |
0x3ca8 VPBROADCASTQ %RDX,%XMM0 |
0x3cae AND $0x7ffffffe,%EDX |
0x3cb4 OR %RCX,%RDX |
0x3cb7 SHR $0x1,%RDX |
0x3cba XOR 0x1728(%RSP),%RDX |
0x3cc2 AND $0x1,%ESI |
0x3cc5 NEG %ESI |
0x3cc7 MOV $-0x66f74f21,%EDI |
0x3ccc AND %R8D,%ESI |
0x3ccf XOR %RDX,%RSI |
0x3cd2 MOV %RSI,0xac0(%RSP) |
0x3cda MOV $0xe8,%ECX |
0x3cdf VPBROADCASTQ 0x2350(%RIP),%XMM5 |
0x3ce8 VPBROADCASTQ 0x234f(%RIP),%XMM6 |
0x3cf1 VPBROADCASTQ 0x234e(%RIP),%XMM7 |
0x3cfa VPBROADCASTQ 0x234d(%RIP),%XMM8 |
0x3d03 NOPW %CS:(%RAX,%RAX,1) |
(71) 0x3d10 VMOVDQU 0x390(%RSP,%RCX,8),%XMM1 |
(71) 0x3d19 VMOVDQU 0x3a0(%RSP,%RCX,8),%XMM2 |
(71) 0x3d22 VPALIGNR $0x8,%XMM0,%XMM1,%XMM0 |
(71) 0x3d28 VMOVDQU 0x3b0(%RSP,%RCX,8),%XMM3 |
(71) 0x3d31 VPAND %XMM6,%XMM1,%XMM4 |
(71) 0x3d35 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM4 |
(71) 0x3d3c VPSRLQ $0x1,%XMM4,%XMM0 |
(71) 0x3d41 VPXOR -0x390(%RSP,%RCX,8),%XMM0,%XMM0 |
(71) 0x3d4a VPTESTMQ %XMM7,%XMM1,%K1 |
(71) 0x3d50 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(71) 0x3d56 VMOVDQU %XMM0,0x388(%RSP,%RCX,8) |
(71) 0x3d5f VPALIGNR $0x8,%XMM1,%XMM2,%XMM0 |
(71) 0x3d65 VPAND %XMM6,%XMM2,%XMM1 |
(71) 0x3d69 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(71) 0x3d70 VPSRLQ $0x1,%XMM1,%XMM0 |
(71) 0x3d75 VPXOR -0x380(%RSP,%RCX,8),%XMM0,%XMM0 |
(71) 0x3d7e VPTESTMQ %XMM7,%XMM2,%K1 |
(71) 0x3d84 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(71) 0x3d8a VMOVDQU %XMM0,0x398(%RSP,%RCX,8) |
(71) 0x3d93 VPALIGNR $0x8,%XMM2,%XMM3,%XMM0 |
(71) 0x3d99 VPAND %XMM6,%XMM3,%XMM1 |
(71) 0x3d9d VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(71) 0x3da4 VPSRLQ $0x1,%XMM1,%XMM0 |
(71) 0x3da9 VPXOR -0x370(%RSP,%RCX,8),%XMM0,%XMM0 |
(71) 0x3db2 VPTESTMQ %XMM7,%XMM3,%K1 |
(71) 0x3db8 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(71) 0x3dbe VMOVDQU %XMM0,0x3a8(%RSP,%RCX,8) |
(71) 0x3dc7 ADD $0x6,%RCX |
(71) 0x3dcb VMOVDQA %XMM3,%XMM0 |
(71) 0x3dcf CMP $0x274,%RCX |
(71) 0x3dd6 JNE 3d10 |
0x3ddc MOV 0x1728(%RSP),%RCX |
0x3de4 MOV $-0x80000000,%RDX |
0x3deb AND %RDX,%RCX |
0x3dee MOV 0x3b0(%RSP),%RDX |
0x3df6 MOV %EDX,%ESI |
0x3df8 AND $0x7ffffffe,%ESI |
0x3dfe OR %RCX,%RSI |
0x3e01 SHR $0x1,%RSI |
0x3e04 XOR 0x1010(%RSP),%RSI |
0x3e0c AND $0x1,%EDX |
0x3e0f NEG %EDX |
0x3e11 AND %EDI,%EDX |
0x3e13 XOR %RSI,%RDX |
0x3e16 MOV %RDX,0x1728(%RSP) |
0x3e1e XOR %ECX,%ECX |
0x3e20 JMP 3a60 |
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/16.1.1/../../../../include/c++/16.1.1/bits/random.tcc: 404 - 3558 |
-------------------------------------------------------------------------------- |
404: for (size_t __k = 0; __k < (__n - __m); ++__k) |
405: { |
406: _UIntType __y = ((_M_x[__k] & __upper_mask) |
407: | (_M_x[__k + 1] & __lower_mask)); |
408: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
409: ^ ((__y & 0x01) ? __a : 0)); |
410: } |
411: |
412: for (size_t __k = (__n - __m); __k < (__n - 1); ++__k) |
413: { |
414: _UIntType __y = ((_M_x[__k] & __upper_mask) |
415: | (_M_x[__k + 1] & __lower_mask)); |
416: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
417: ^ ((__y & 0x01) ? __a : 0)); |
418: } |
419: |
420: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
421: | (_M_x[0] & __lower_mask)); |
422: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
423: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
458: if (_M_p >= state_size) |
459: _M_gen_rand(); |
460: |
461: // Calculate o(x(i)). |
462: result_type __z = _M_x[_M_p++]; |
463: __z ^= (__z >> __u) & __d; |
464: __z ^= (__z << __s) & __b; |
465: __z ^= (__z << __t) & __c; |
466: __z ^= (__z >> __l); |
[...] |
3557: const _RealT __ret = _RealT(__sum >> __log2_x) / _RealT(__rd); |
3558: if (__ret < _RealT(1.0)) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-clang-skl256 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.71 |
| CQA speedup if FP arith vectorized | 2.32 |
| CQA speedup if fully vectorized | 12.68 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.57 |
| Bottlenecks | |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 15.75 |
| CQA cycles if no scalar integer | 4.25 |
| CQA cycles if FP arith vectorized | 6.79 |
| CQA cycles if fully vectorized | 1.24 |
| Front-end cycles | 15.75 |
| P0 cycles | 10.00 |
| P1 cycles | 10.00 |
| P2 cycles | 6.00 |
| P3 cycles | 6.00 |
| P4 cycles | 3.00 |
| P5 cycles | 10.00 |
| P6 cycles | 10.00 |
| P7 cycles | 3.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 61.00 |
| Nb uops | 63.00 |
| Nb loads | 12.00 |
| Nb stores | 3.00 |
| Nb stack references | 5.50 |
| FLOP/cycle | 0.06 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 5.85 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 88.00 |
| Bytes stored | 24.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.89 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 1.00 |
| Vector-efficiency ratio all | 9.51 |
| Vector-efficiency ratio load | 8.93 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.38 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.85 |
| CQA speedup if FP arith vectorized | 2.16 |
| CQA speedup if fully vectorized | 12.22 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.59 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 25.00 |
| CQA cycles if no scalar integer | 6.50 |
| CQA cycles if FP arith vectorized | 11.59 |
| CQA cycles if fully vectorized | 2.05 |
| Front-end cycles | 25.00 |
| P0 cycles | 15.75 |
| P1 cycles | 15.75 |
| P2 cycles | 10.50 |
| P3 cycles | 10.50 |
| P4 cycles | 5.00 |
| P5 cycles | 15.75 |
| P6 cycles | 15.75 |
| P7 cycles | 5.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 97.00 |
| Nb uops | 100.00 |
| Nb loads | 21.00 |
| Nb stores | 5.00 |
| Nb stack references | 10.00 |
| FLOP/cycle | 0.04 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 160.00 |
| Bytes stored | 40.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 1.79 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 2.00 |
| Vector-efficiency ratio all | 10.16 |
| Vector-efficiency ratio load | 11.61 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.00 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.25 |
| CQA speedup if FP arith vectorized | 3.27 |
| CQA speedup if fully vectorized | 14.81 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.53 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 6.50 |
| CQA cycles if no scalar integer | 2.00 |
| CQA cycles if FP arith vectorized | 1.99 |
| CQA cycles if fully vectorized | 0.44 |
| Front-end cycles | 6.50 |
| P0 cycles | 4.25 |
| P1 cycles | 4.25 |
| P2 cycles | 1.50 |
| P3 cycles | 1.50 |
| P4 cycles | 1.00 |
| P5 cycles | 4.25 |
| P6 cycles | 4.25 |
| P7 cycles | 1.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 25.00 |
| Nb uops | 26.00 |
| Nb loads | 3.00 |
| Nb stores | 1.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 0.15 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 3.69 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 16.00 |
| Bytes stored | 8.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.85 |
| Vector-efficiency ratio load | 6.25 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 8.75 |
| Path / |
| Function | main |
| Source file and lines | random.tcc:404-3558 |
| Module | attention-clang-skl256 |
| nb instructions | 61 |
| nb uops | 63 |
| loop length | 298 |
| used x86 registers | 5.50 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 2.50 |
| used zmm registers | 0 |
| nb stack references | 5.50 |
| micro-operation queue | 15.75 cycles |
| front end | 15.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 10.00 | 10.00 | 6.00 | 6.00 | 3.00 | 10.00 | 10.00 | 3.00 |
| cycles | 10.00 | 10.00 | 6.00 | 6.00 | 3.00 | 10.00 | 10.00 | 3.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 15.75 |
| Dispatch | 10.00 |
| Data deps. | 0.00 |
| Overall L1 | 15.75 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 1% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 1% |
| all | 9% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 6% |
| load | 6% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 9% |
| load | 8% |
| store | 12% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Function | main |
| Source file and lines | random.tcc:404-3558 |
| Module | attention-clang-skl256 |
| nb instructions | 97 |
| nb uops | 100 |
| loop length | 488 |
| used x86 registers | 7 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 5 |
| used zmm registers | 0 |
| nb stack references | 10 |
| micro-operation queue | 25.00 cycles |
| front end | 25.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 15.75 | 15.75 | 10.50 | 10.50 | 5.00 | 15.75 | 15.75 | 5.00 |
| cycles | 15.75 | 15.75 | 10.50 | 10.50 | 5.00 | 15.75 | 15.75 | 5.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 25.00 |
| Dispatch | 15.75 |
| Data deps. | 0.00 |
| Overall L1 | 25.00 |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 2% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 2% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 6% |
| load | 6% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 10% |
| load | 11% |
| store | 12% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RCX,0x1730(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3b0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x2553(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x254f(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3e25 <main+0x1945> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JB 3a60 <main+0x1580> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPBROADCASTQ 0x3b0(%RSP),%YMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPBROADCASTQ 0x2557(%RIP),%YMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2556(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2555(%RIP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2554(%RIP),%YMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTI128 $0x1,%YMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VPEXTRQ $0x1,%XMM0,%RSI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xab8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xac0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1718(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%R8D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDI,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xab0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1720(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R8D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0xab8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xac8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RDX,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| OR %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR 0x1728(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xac0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0xe8,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VPBROADCASTQ 0x2350(%RIP),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x234f(%RIP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x234e(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x234d(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1728(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x3b0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1010(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %EDI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0x1728(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 3a60 <main+0x1580> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | random.tcc:404-3558 |
| Module | attention-clang-skl256 |
| nb instructions | 25 |
| nb uops | 26 |
| loop length | 108 |
| used x86 registers | 4 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 1 |
| micro-operation queue | 6.50 cycles |
| front end | 6.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 4.25 | 4.25 | 1.50 | 1.50 | 1.00 | 4.25 | 4.25 | 1.00 |
| cycles | 4.25 | 4.25 | 1.50 | 1.50 | 1.00 | 4.25 | 4.25 | 1.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 6.50 |
| Dispatch | 4.25 |
| Data deps. | 0.00 |
| Overall L1 | 6.50 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 9% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 6% |
| load | 6% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 8% |
| load | 6% |
| store | 12% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 8% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RCX,0x1730(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3b0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x2553(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x254f(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3e25 <main+0x1945> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JB 3a60 <main+0x1580> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
