| Loop Id: 73 | Module: attention-clang-skl512 | Source: random.tcc:404-3558 [...] | Coverage: 0.16% |
|---|
| Loop Id: 73 | Module: attention-clang-skl512 | Source: random.tcc:404-3558 [...] | Coverage: 0.16% |
|---|
0x3780 MOV %RCX,%RDX |
0x3783 INC %RCX |
0x3786 MOV %RCX,0x1770(%RSP) |
0x378e MOV 0x3f0(%RSP,%RDX,8),%RDX |
0x3796 MOV %RDX,%RSI |
0x3799 SHR $0xb,%RSI |
0x379d MOV %ESI,%ESI |
0x379f XOR %RDX,%RSI |
0x37a2 MOV %ESI,%EDX |
0x37a4 SAL $0x7,%EDX |
0x37a7 AND $-0x62d3a980,%EDX |
0x37ad XOR %RSI,%RDX |
0x37b0 MOV %EDX,%ESI |
0x37b2 SAL $0xf,%ESI |
0x37b5 AND $-0x103a0000,%ESI |
0x37bb XOR %RDX,%RSI |
0x37be MOV %RSI,%RDX |
0x37c1 SHR $0x12,%RDX |
0x37c5 XOR %ESI,%EDX |
0x37c7 VCVTUSI2SS %EDX,%XMM15,%XMM0 |
0x37cd VMULSS 0x2833(%RIP),%XMM0,%XMM0 |
0x37d5 VUCOMISS 0x282f(%RIP),%XMM0 |
0x37dd JB 3b65 |
0x37e3 CMP $0x270,%RCX |
0x37ea JB 3780 |
0x37ec VPBROADCASTQ 0x3f0(%RSP),%ZMM0 |
0x37f4 XOR %ECX,%ECX |
0x37f6 VPBROADCASTQ 0x2838(%RIP),%ZMM12 |
0x3800 VPBROADCASTQ 0x2836(%RIP),%ZMM13 |
0x380a VPBROADCASTQ 0x2834(%RIP),%ZMM14 |
0x3814 VPBROADCASTQ 0x2832(%RIP),%ZMM15 |
0x381e XCHG %AX,%AX |
(74) 0x3820 VMOVDQA64 %ZMM0,%ZMM1 |
(74) 0x3826 VMOVDQU64 0x3f8(%RSP,%RCX,8),%ZMM2 |
(74) 0x3831 VMOVDQU64 0x438(%RSP,%RCX,8),%ZMM3 |
(74) 0x383c VMOVDQU64 0x478(%RSP,%RCX,8),%ZMM4 |
(74) 0x3847 VMOVDQU64 0x4b8(%RSP,%RCX,8),%ZMM0 |
(74) 0x3852 VALIGNQ $0x7,%ZMM1,%ZMM2,%ZMM1 |
(74) 0x3859 VALIGNQ $0x7,%ZMM2,%ZMM3,%ZMM5 |
(74) 0x3860 VALIGNQ $0x7,%ZMM3,%ZMM4,%ZMM6 |
(74) 0x3867 VALIGNQ $0x7,%ZMM4,%ZMM0,%ZMM7 |
(74) 0x386e VPANDQ %ZMM13,%ZMM2,%ZMM8 |
(74) 0x3874 VPANDQ %ZMM13,%ZMM3,%ZMM9 |
(74) 0x387a VPANDQ %ZMM13,%ZMM4,%ZMM10 |
(74) 0x3880 VPANDQ %ZMM13,%ZMM0,%ZMM11 |
(74) 0x3886 VPTERNLOGQ $-0x8,%ZMM12,%ZMM1,%ZMM8 |
(74) 0x388d VPTERNLOGQ $-0x8,%ZMM12,%ZMM5,%ZMM9 |
(74) 0x3894 VPTERNLOGQ $-0x8,%ZMM12,%ZMM6,%ZMM10 |
(74) 0x389b VPTERNLOGQ $-0x8,%ZMM12,%ZMM7,%ZMM11 |
(74) 0x38a2 VPSRLQ $0x1,%ZMM8,%ZMM1 |
(74) 0x38a9 VPSRLQ $0x1,%ZMM9,%ZMM5 |
(74) 0x38b0 VPSRLQ $0x1,%ZMM10,%ZMM6 |
(74) 0x38b7 VPSRLQ $0x1,%ZMM11,%ZMM7 |
(74) 0x38be VPXORQ 0x1058(%RSP,%RCX,8),%ZMM1,%ZMM1 |
(74) 0x38c9 VPXORQ 0x1098(%RSP,%RCX,8),%ZMM5,%ZMM5 |
(74) 0x38d4 VPXORQ 0x10d8(%RSP,%RCX,8),%ZMM6,%ZMM6 |
(74) 0x38df VPXORQ 0x1118(%RSP,%RCX,8),%ZMM7,%ZMM7 |
(74) 0x38ea VPTESTMQ %ZMM14,%ZMM2,%K1 |
(74) 0x38f0 VPTESTMQ %ZMM14,%ZMM3,%K2 |
(74) 0x38f6 VPTESTMQ %ZMM14,%ZMM4,%K3 |
(74) 0x38fc VPTESTMQ %ZMM14,%ZMM0,%K4 |
(74) 0x3902 VPXORQ %ZMM15,%ZMM1,%ZMM1{%K1} |
(74) 0x3908 VPXORQ %ZMM15,%ZMM5,%ZMM5{%K2} |
(74) 0x390e VPXORQ %ZMM15,%ZMM6,%ZMM6{%K3} |
(74) 0x3914 VPXORQ %ZMM15,%ZMM7,%ZMM7{%K4} |
(74) 0x391a VMOVDQU64 %ZMM1,0x3f0(%RSP,%RCX,8) |
(74) 0x3925 VMOVDQU64 %ZMM5,0x430(%RSP,%RCX,8) |
(74) 0x3930 VMOVDQU64 %ZMM6,0x470(%RSP,%RCX,8) |
(74) 0x393b VMOVDQU64 %ZMM7,0x4b0(%RSP,%RCX,8) |
(74) 0x3946 ADD $0x20,%RCX |
(74) 0x394a CMP $0xe0,%RCX |
(74) 0x3951 JNE 3820 |
0x3957 VEXTRACTI32X4 $0x3,%ZMM0,%XMM0 |
0x395e VPEXTRQ $0x1,%XMM0,%RSI |
0x3964 AND $-0x80000000,%RSI |
0x396b MOV 0xaf8(%RSP),%RDX |
0x3973 MOV 0xb00(%RSP),%RCX |
0x397b MOV %EDX,%EDI |
0x397d AND $0x7ffffffe,%EDI |
0x3983 OR %RSI,%RDI |
0x3986 SHR $0x1,%RDI |
0x3989 XOR 0x1758(%RSP),%RDI |
0x3991 MOV %EDX,%ESI |
0x3993 AND $0x1,%ESI |
0x3996 NEG %ESI |
0x3998 MOV $-0x66f74f21,%R8D |
0x399e AND %R8D,%ESI |
0x39a1 XOR %RDI,%RSI |
0x39a4 MOV %RSI,0xaf0(%RSP) |
0x39ac AND $-0x80000000,%RDX |
0x39b3 MOV %ECX,%ESI |
0x39b5 AND $0x7ffffffe,%ESI |
0x39bb OR %RDX,%RSI |
0x39be SHR $0x1,%RSI |
0x39c1 XOR 0x1760(%RSP),%RSI |
0x39c9 MOV %ECX,%EDX |
0x39cb AND $0x1,%EDX |
0x39ce NEG %EDX |
0x39d0 AND %R8D,%EDX |
0x39d3 XOR %RSI,%RDX |
0x39d6 MOV %RDX,0xaf8(%RSP) |
0x39de AND $-0x80000000,%RCX |
0x39e5 MOV 0xb08(%RSP),%RDX |
0x39ed MOV %EDX,%ESI |
0x39ef VPBROADCASTQ %RDX,%XMM0 |
0x39f5 AND $0x7ffffffe,%EDX |
0x39fb OR %RCX,%RDX |
0x39fe SHR $0x1,%RDX |
0x3a01 XOR 0x1768(%RSP),%RDX |
0x3a09 AND $0x1,%ESI |
0x3a0c NEG %ESI |
0x3a0e MOV $-0x66f74f21,%EDI |
0x3a13 AND %R8D,%ESI |
0x3a16 XOR %RDX,%RSI |
0x3a19 MOV %RSI,0xb00(%RSP) |
0x3a21 MOV $0xe8,%ECX |
0x3a26 VPBROADCASTQ 0x2609(%RIP),%XMM5 |
0x3a2f VPBROADCASTQ 0x2608(%RIP),%XMM6 |
0x3a38 VPBROADCASTQ 0x2607(%RIP),%XMM7 |
0x3a41 VPBROADCASTQ 0x2606(%RIP),%XMM8 |
0x3a4a NOPW (%RAX,%RAX,1) |
(75) 0x3a50 VMOVDQU 0x3d0(%RSP,%RCX,8),%XMM1 |
(75) 0x3a59 VMOVDQU 0x3e0(%RSP,%RCX,8),%XMM2 |
(75) 0x3a62 VPALIGNR $0x8,%XMM0,%XMM1,%XMM0 |
(75) 0x3a68 VMOVDQU 0x3f0(%RSP,%RCX,8),%XMM3 |
(75) 0x3a71 VPAND %XMM6,%XMM1,%XMM4 |
(75) 0x3a75 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM4 |
(75) 0x3a7c VPSRLQ $0x1,%XMM4,%XMM0 |
(75) 0x3a81 VPXOR -0x350(%RSP,%RCX,8),%XMM0,%XMM0 |
(75) 0x3a8a VPTESTMQ %XMM7,%XMM1,%K1 |
(75) 0x3a90 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(75) 0x3a96 VMOVDQU %XMM0,0x3c8(%RSP,%RCX,8) |
(75) 0x3a9f VPALIGNR $0x8,%XMM1,%XMM2,%XMM0 |
(75) 0x3aa5 VPAND %XMM6,%XMM2,%XMM1 |
(75) 0x3aa9 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(75) 0x3ab0 VPSRLQ $0x1,%XMM1,%XMM0 |
(75) 0x3ab5 VPXOR -0x340(%RSP,%RCX,8),%XMM0,%XMM0 |
(75) 0x3abe VPTESTMQ %XMM7,%XMM2,%K1 |
(75) 0x3ac4 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(75) 0x3aca VMOVDQU %XMM0,0x3d8(%RSP,%RCX,8) |
(75) 0x3ad3 VPALIGNR $0x8,%XMM2,%XMM3,%XMM0 |
(75) 0x3ad9 VPAND %XMM6,%XMM3,%XMM1 |
(75) 0x3add VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(75) 0x3ae4 VPSRLQ $0x1,%XMM1,%XMM0 |
(75) 0x3ae9 VPXOR -0x330(%RSP,%RCX,8),%XMM0,%XMM0 |
(75) 0x3af2 VPTESTMQ %XMM7,%XMM3,%K1 |
(75) 0x3af8 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(75) 0x3afe VMOVDQU %XMM0,0x3e8(%RSP,%RCX,8) |
(75) 0x3b07 ADD $0x6,%RCX |
(75) 0x3b0b VMOVDQA %XMM3,%XMM0 |
(75) 0x3b0f CMP $0x274,%RCX |
(75) 0x3b16 JNE 3a50 |
0x3b1c MOV 0x1768(%RSP),%RCX |
0x3b24 MOV $-0x80000000,%RDX |
0x3b2b AND %RDX,%RCX |
0x3b2e MOV 0x3f0(%RSP),%RDX |
0x3b36 MOV %EDX,%ESI |
0x3b38 AND $0x7ffffffe,%ESI |
0x3b3e OR %RCX,%RSI |
0x3b41 SHR $0x1,%RSI |
0x3b44 XOR 0x1050(%RSP),%RSI |
0x3b4c AND $0x1,%EDX |
0x3b4f NEG %EDX |
0x3b51 AND %EDI,%EDX |
0x3b53 XOR %RSI,%RDX |
0x3b56 MOV %RDX,0x1768(%RSP) |
0x3b5e XOR %ECX,%ECX |
0x3b60 JMP 3780 |
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/16.1.1/../../../../include/c++/16.1.1/bits/random.tcc: 404 - 3558 |
-------------------------------------------------------------------------------- |
404: for (size_t __k = 0; __k < (__n - __m); ++__k) |
405: { |
406: _UIntType __y = ((_M_x[__k] & __upper_mask) |
407: | (_M_x[__k + 1] & __lower_mask)); |
408: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
409: ^ ((__y & 0x01) ? __a : 0)); |
410: } |
411: |
412: for (size_t __k = (__n - __m); __k < (__n - 1); ++__k) |
413: { |
414: _UIntType __y = ((_M_x[__k] & __upper_mask) |
415: | (_M_x[__k + 1] & __lower_mask)); |
416: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
417: ^ ((__y & 0x01) ? __a : 0)); |
418: } |
419: |
420: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
421: | (_M_x[0] & __lower_mask)); |
422: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
423: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
458: if (_M_p >= state_size) |
459: _M_gen_rand(); |
460: |
461: // Calculate o(x(i)). |
462: result_type __z = _M_x[_M_p++]; |
463: __z ^= (__z >> __u) & __d; |
464: __z ^= (__z << __s) & __b; |
465: __z ^= (__z << __t) & __c; |
466: __z ^= (__z >> __l); |
[...] |
3557: const _RealT __ret = _RealT(__sum >> __log2_x) / _RealT(__rd); |
3558: if (__ret < _RealT(1.0)) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-clang-skl512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.71 |
| CQA speedup if FP arith vectorized | 2.32 |
| CQA speedup if fully vectorized | 12.68 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.57 |
| Bottlenecks | |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 15.75 |
| CQA cycles if no scalar integer | 4.25 |
| CQA cycles if FP arith vectorized | 6.79 |
| CQA cycles if fully vectorized | 1.24 |
| Front-end cycles | 15.75 |
| P0 cycles | 10.00 |
| P1 cycles | 10.00 |
| P2 cycles | 6.00 |
| P3 cycles | 6.00 |
| P4 cycles | 3.00 |
| P5 cycles | 10.00 |
| P6 cycles | 10.00 |
| P7 cycles | 3.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 61.00 |
| Nb uops | 63.00 |
| Nb loads | 12.00 |
| Nb stores | 3.00 |
| Nb stack references | 5.50 |
| FLOP/cycle | 0.06 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 5.85 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 88.00 |
| Bytes stored | 24.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.89 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 1.00 |
| Vector-efficiency ratio all | 9.51 |
| Vector-efficiency ratio load | 8.93 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.38 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.85 |
| CQA speedup if FP arith vectorized | 2.16 |
| CQA speedup if fully vectorized | 12.22 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.59 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 25.00 |
| CQA cycles if no scalar integer | 6.50 |
| CQA cycles if FP arith vectorized | 11.59 |
| CQA cycles if fully vectorized | 2.05 |
| Front-end cycles | 25.00 |
| P0 cycles | 15.75 |
| P1 cycles | 15.75 |
| P2 cycles | 10.50 |
| P3 cycles | 10.50 |
| P4 cycles | 5.00 |
| P5 cycles | 15.75 |
| P6 cycles | 15.75 |
| P7 cycles | 5.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 97.00 |
| Nb uops | 100.00 |
| Nb loads | 21.00 |
| Nb stores | 5.00 |
| Nb stack references | 10.00 |
| FLOP/cycle | 0.04 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 160.00 |
| Bytes stored | 40.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 1.79 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 2.00 |
| Vector-efficiency ratio all | 10.16 |
| Vector-efficiency ratio load | 11.61 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.00 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.25 |
| CQA speedup if FP arith vectorized | 3.27 |
| CQA speedup if fully vectorized | 14.81 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.53 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 6.50 |
| CQA cycles if no scalar integer | 2.00 |
| CQA cycles if FP arith vectorized | 1.99 |
| CQA cycles if fully vectorized | 0.44 |
| Front-end cycles | 6.50 |
| P0 cycles | 4.25 |
| P1 cycles | 4.25 |
| P2 cycles | 1.50 |
| P3 cycles | 1.50 |
| P4 cycles | 1.00 |
| P5 cycles | 4.25 |
| P6 cycles | 4.25 |
| P7 cycles | 1.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 25.00 |
| Nb uops | 26.00 |
| Nb loads | 3.00 |
| Nb stores | 1.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 0.15 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 3.69 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 16.00 |
| Bytes stored | 8.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 8.85 |
| Vector-efficiency ratio load | 6.25 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 8.75 |
| Path / |
| Function | main |
| Source file and lines | random.tcc:404-3558 |
| Module | attention-clang-skl512 |
| nb instructions | 61 |
| nb uops | 63 |
| loop length | 295 |
| used x86 registers | 5.50 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 0 |
| used zmm registers | 2.50 |
| nb stack references | 5.50 |
| micro-operation queue | 15.75 cycles |
| front end | 15.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 10.00 | 10.00 | 6.00 | 6.00 | 3.00 | 10.00 | 10.00 | 3.00 |
| cycles | 10.00 | 10.00 | 6.00 | 6.00 | 3.00 | 10.00 | 10.00 | 3.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 15.75 |
| Dispatch | 10.00 |
| Data deps. | 0.00 |
| Overall L1 | 15.75 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 1% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 1% |
| all | 9% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 6% |
| load | 6% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 9% |
| load | 8% |
| store | 12% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Function | main |
| Source file and lines | random.tcc:404-3558 |
| Module | attention-clang-skl512 |
| nb instructions | 97 |
| nb uops | 100 |
| loop length | 482 |
| used x86 registers | 7 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 0 |
| used zmm registers | 5 |
| nb stack references | 10 |
| micro-operation queue | 25.00 cycles |
| front end | 25.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 15.75 | 15.75 | 10.50 | 10.50 | 5.00 | 15.75 | 15.75 | 5.00 |
| cycles | 15.75 | 15.75 | 10.50 | 10.50 | 5.00 | 15.75 | 15.75 | 5.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 25.00 |
| Dispatch | 15.75 |
| Data deps. | 0.00 |
| Overall L1 | 25.00 |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 2% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 2% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 6% |
| load | 6% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 10% |
| load | 11% |
| store | 12% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RCX,0x1770(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3f0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x2833(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x282f(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3b65 <main+0x15c5> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JB 3780 <main+0x11e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPBROADCASTQ 0x3f0(%RSP),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPBROADCASTQ 0x2838(%RIP),%ZMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2836(%RIP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2834(%RIP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2832(%RIP),%ZMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTI32X4 $0x3,%ZMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VPEXTRQ $0x1,%XMM0,%RSI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xaf8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xb00(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1758(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%R8D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDI,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xaf0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1760(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R8D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0xaf8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0xb08(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RDX,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| OR %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR 0x1768(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xb00(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0xe8,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VPBROADCASTQ 0x2609(%RIP),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2608(%RIP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2607(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2606(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1768(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV 0x3f0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1050(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %EDI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0x1768(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| JMP 3780 <main+0x11e0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| Function | main |
| Source file and lines | random.tcc:404-3558 |
| Module | attention-clang-skl512 |
| nb instructions | 25 |
| nb uops | 26 |
| loop length | 108 |
| used x86 registers | 4 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 1 |
| micro-operation queue | 6.50 cycles |
| front end | 6.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 4.25 | 4.25 | 1.50 | 1.50 | 1.00 | 4.25 | 4.25 | 1.00 |
| cycles | 4.25 | 4.25 | 1.50 | 1.50 | 1.00 | 4.25 | 4.25 | 1.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| Front-end | 6.50 |
| Dispatch | 4.25 |
| Data deps. | 0.00 |
| Overall L1 | 6.50 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 9% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 6% |
| load | 6% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 8% |
| load | 6% |
| store | 12% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 8% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RCX,0x1770(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3f0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x2833(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x282f(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3b65 <main+0x15c5> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| JB 3780 <main+0x11e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
