| Function: std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, ... | Module: attention-gcc-skl512 | Source: random.tcc:397-425 [...] | Coverage (incl. loops): 0.18% | (excl. loops): 0.00% |
|---|
| Function: std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, ... | Module: attention-gcc-skl512 | Source: random.tcc:397-425 [...] | Coverage (incl. loops): 0.18% | (excl. loops): 0.00% |
|---|
/usr/include/c++/16.1.1/bits/random.tcc: 397 - 425 |
-------------------------------------------------------------------------------- |
397: mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, |
[...] |
404: for (size_t __k = 0; __k < (__n - __m); ++__k) |
405: { |
406: _UIntType __y = ((_M_x[__k] & __upper_mask) |
407: | (_M_x[__k + 1] & __lower_mask)); |
408: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
409: ^ ((__y & 0x01) ? __a : 0)); |
410: } |
411: |
412: for (size_t __k = (__n - __m); __k < (__n - 1); ++__k) |
413: { |
414: _UIntType __y = ((_M_x[__k] & __upper_mask) |
415: | (_M_x[__k + 1] & __lower_mask)); |
416: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
417: ^ ((__y & 0x01) ? __a : 0)); |
418: } |
419: |
420: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
421: | (_M_x[0] & __lower_mask)); |
422: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
423: ^ ((__y & 0x01) ? __a : 0)); |
424: _M_p = 0; |
425: } |
0x36c0 MOV $-0x66f74f21,%EAX |
0x36c5 LEA 0x700(%RDI),%RDX |
0x36cc VPBROADCASTQ %RAX,%ZMM8 |
0x36d2 MOV $0x1,%EAX |
0x36d7 VPBROADCASTQ %RAX,%ZMM9 |
0x36dd MOV $0x7fffffff,%EAX |
0x36e2 VMOVDQA64 %ZMM8,%ZMM2 |
0x36e8 VPBROADCASTQ %RAX,%ZMM7 |
0x36ee MOV $-0x80000000,%RAX |
0x36f5 VMOVDQA64 %ZMM9,%ZMM3 |
0x36fb VPBROADCASTQ %RAX,%ZMM6 |
0x3701 VMOVDQA64 %ZMM7,%ZMM4 |
0x3707 MOV %RDI,%RAX |
0x370a VMOVDQA64 %ZMM6,%ZMM5 |
(41) 0x3710 VPANDQ 0x8(%RAX),%ZMM4,%ZMM1 |
(41) 0x371a VMOVDQA64 %ZMM5,%ZMM0 |
(41) 0x3720 ADD $0x40,%RAX |
(41) 0x3724 VPTERNLOGQ $-0x14,-0x40(%RAX),%ZMM1,%ZMM0 |
(41) 0x372c VPSRLQ $0x1,%ZMM0,%ZMM1 |
(41) 0x3733 VPANDQ %ZMM3,%ZMM0,%ZMM0 |
(41) 0x3739 VPMULLQ %ZMM2,%ZMM0,%ZMM0 |
(41) 0x373f VPTERNLOGQ $-0x6a,0xc28(%RAX),%ZMM0,%ZMM1 |
(41) 0x374a VMOVDQU64 %ZMM1,-0x40(%RAX) |
(41) 0x3751 CMP %RAX,%RDX |
(41) 0x3754 JNE 3710 |
0x3756 VPANDQ 0x708(%RDI),%XMM7,%XMM1 |
0x3760 VMOVDQA %XMM6,%XMM0 |
0x3764 MOV 0x710(%RDI),%RAX |
0x376b VMOVDQA64 %ZMM6,%ZMM5 |
0x3771 MOV 0x718(%RDI),%RDX |
0x3778 VMOVDQA64 %ZMM7,%ZMM4 |
0x377e VMOVDQA64 %ZMM9,%ZMM3 |
0x3784 VMOVDQA64 %ZMM8,%ZMM2 |
0x378a VPTERNLOGQ $-0x14,0x700(%RDI),%XMM1,%XMM0 |
0x3792 AND $-0x80000000,%RAX |
0x3798 VPSRLQ $0x1,%XMM0,%XMM1 |
0x379d VPANDQ %XMM9,%XMM0,%XMM0 |
0x37a3 AND $0x7fffffff,%EDX |
0x37a9 VPMULLQ %XMM8,%XMM0,%XMM0 |
0x37af OR %RDX,%RAX |
0x37b2 MOV %RAX,%RDX |
0x37b5 AND $0x1,%EAX |
0x37b8 SHR $0x1,%RDX |
0x37bb NEG %RAX |
0x37be XOR 0x1378(%RDI),%RDX |
0x37c5 AND $-0x66f74f21,%EAX |
0x37ca XOR %RDX,%RAX |
0x37cd LEA 0x1358(%RDI),%RDX |
0x37d4 VPTERNLOGQ $-0x6a,0x1368(%RDI),%XMM0,%XMM1 |
0x37df MOV %RAX,0x710(%RDI) |
0x37e6 LEA 0x718(%RDI),%RAX |
0x37ed VMOVDQU %XMM1,0x700(%RDI) |
0x37f5 NOPL (%RAX) |
(42) 0x37f8 VPANDQ 0x8(%RAX),%ZMM4,%ZMM1 |
(42) 0x3802 VMOVDQA64 %ZMM5,%ZMM0 |
(42) 0x3808 ADD $0x40,%RAX |
(42) 0x380c VPTERNLOGQ $-0x14,-0x40(%RAX),%ZMM1,%ZMM0 |
(42) 0x3814 VPSRLQ $0x1,%ZMM0,%ZMM1 |
(42) 0x381b VPANDQ %ZMM3,%ZMM0,%ZMM0 |
(42) 0x3821 VPMULLQ %ZMM2,%ZMM0,%ZMM0 |
(42) 0x3827 VPTERNLOGQ $-0x6a,-0x758(%RAX),%ZMM0,%ZMM1 |
(42) 0x3832 VMOVDQU64 %ZMM1,-0x40(%RAX) |
(42) 0x3839 CMP %RAX,%RDX |
(42) 0x383c JNE 37f8 |
0x383e VPANDQ 0x1360(%RDI),%YMM7,%YMM0 |
0x3848 MOV 0x1378(%RDI),%RAX |
0x384f MOVQ $0,0x1380(%RDI) |
0x385a MOV (%RDI),%RDX |
0x385d VPTERNLOGQ $-0x14,0x1358(%RDI),%YMM0,%YMM6 |
0x3868 AND $-0x80000000,%RAX |
0x386e AND $0x7fffffff,%EDX |
0x3874 VPSRLQ $0x1,%YMM6,%YMM1 |
0x3879 VPANDQ %YMM9,%YMM6,%YMM0 |
0x387f OR %RDX,%RAX |
0x3882 VPMULLQ %YMM8,%YMM0,%YMM0 |
0x3888 MOV %RAX,%RDX |
0x388b AND $0x1,%EAX |
0x388e SHR $0x1,%RDX |
0x3891 NEG %RAX |
0x3894 XOR 0xc60(%RDI),%RDX |
0x389b AND $-0x66f74f21,%EAX |
0x38a0 XOR %RDX,%RAX |
0x38a3 MOV %RAX,0x1378(%RDI) |
0x38aa VPTERNLOGQ $-0x6a,0xc40(%RDI),%YMM0,%YMM1 |
0x38b2 VMOVDQU %YMM1,0x1358(%RDI) |
0x38ba VZEROUPPER |
0x38bd RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | new_allocator.h:183 | attention-gcc-skl512 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in this panel excludes loops and accounts for 0.00% of the application time in run `run_0`.
| Source file and lines | random.tcc:397-425 |
| Module | attention-gcc-skl512 |
| nb instructions | 65 |
| nb uops | 76 |
| loop length | 370 |
| used x86 registers | 3 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 6 |
| used zmm registers | 8 |
| nb stack references | 0 |
| micro-operation queue | 19.00 cycles |
| front end | 19.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 11.58 | 11.50 | 6.00 | 6.00 | 5.00 | 11.42 | 11.50 | 6.00 |
| cycles | 11.58 | 11.50 | 6.00 | 6.00 | 5.00 | 11.42 | 11.50 | 6.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 19.00 |
| Dispatch | 11.58 |
| Overall L1 | 19.00 |
| all | 40% |
| load | 50% |
| store | 40% |
| mul | 100% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 40% |
| all | 29% |
| load | 25% |
| store | 21% |
| mul | 37% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 31% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV $-0x66f74f21,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| LEA 0x700(%RDI),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV $0x1,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RAX,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV $0x7fffffff,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VMOVDQA64 %ZMM8,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPBROADCASTQ %RAX,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV $-0x80000000,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VMOVDQA64 %ZMM9,%ZMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPBROADCASTQ %RAX,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VMOVDQA64 %ZMM7,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| VMOVDQA64 %ZMM6,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPANDQ 0x708(%RDI),%XMM7,%XMM1 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VMOVDQA %XMM6,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x710(%RDI),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VMOVDQA64 %ZMM6,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| MOV 0x718(%RDI),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VMOVDQA64 %ZMM7,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVDQA64 %ZMM9,%ZMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVDQA64 %ZMM8,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPTERNLOGQ $-0x14,0x700(%RDI),%XMM1,%XMM0 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| AND $-0x80000000,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VPSRLQ $0x1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VPANDQ %XMM9,%XMM0,%XMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| AND $0x7fffffff,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPMULLQ %XMM8,%XMM0,%XMM0 | 3 | 1.50 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 | vect (25.0%) |
| OR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RAX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $0x1,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| NEG %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| XOR 0x1378(%RDI),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $-0x66f74f21,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| LEA 0x1358(%RDI),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VPTERNLOGQ $-0x6a,0x1368(%RDI),%XMM0,%XMM1 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| MOV %RAX,0x710(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA 0x718(%RDI),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVDQU %XMM1,0x700(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | vect (25.0%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPANDQ 0x1360(%RDI),%YMM7,%YMM0 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| MOV 0x1378(%RDI),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOVQ $0,0x1380(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 2 | 1 | scal (6.3%) |
| MOV (%RDI),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VPTERNLOGQ $-0x14,0x1358(%RDI),%YMM0,%YMM6 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| AND $-0x80000000,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| AND $0x7fffffff,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPSRLQ $0x1,%YMM6,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| VPANDQ %YMM9,%YMM6,%YMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| OR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VPMULLQ %YMM8,%YMM0,%YMM0 | 3 | 1.50 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 | vect (50.0%) |
| MOV %RAX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $0x1,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| NEG %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| XOR 0xc60(%RDI),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $-0x66f74f21,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RAX,0x1378(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPTERNLOGQ $-0x6a,0xc40(%RDI),%YMM0,%YMM1 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| VMOVDQU %YMM1,0x1358(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | vect (50.0%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
The code analyzed by CQA in this panel excludes loops and accounts for 0.00% of the application time in run `run_0`.
| Source file and lines | random.tcc:397-425 |
| Module | attention-gcc-skl512 |
| nb instructions | 65 |
| nb uops | 76 |
| loop length | 370 |
| used x86 registers | 3 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 6 |
| used zmm registers | 8 |
| nb stack references | 0 |
| micro-operation queue | 19.00 cycles |
| front end | 19.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 11.58 | 11.50 | 6.00 | 6.00 | 5.00 | 11.42 | 11.50 | 6.00 |
| cycles | 11.58 | 11.50 | 6.00 | 6.00 | 5.00 | 11.42 | 11.50 | 6.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 19.00 |
| Dispatch | 11.58 |
| Overall L1 | 19.00 |
| all | 40% |
| load | 50% |
| store | 40% |
| mul | 100% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 40% |
| all | 29% |
| load | 25% |
| store | 21% |
| mul | 37% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 31% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV $-0x66f74f21,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| LEA 0x700(%RDI),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VPBROADCASTQ %RAX,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV $0x1,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RAX,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV $0x7fffffff,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VMOVDQA64 %ZMM8,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPBROADCASTQ %RAX,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| MOV $-0x80000000,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VMOVDQA64 %ZMM9,%ZMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPBROADCASTQ %RAX,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| VMOVDQA64 %ZMM7,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| VMOVDQA64 %ZMM6,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPANDQ 0x708(%RDI),%XMM7,%XMM1 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VMOVDQA %XMM6,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (25.0%) |
| MOV 0x710(%RDI),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VMOVDQA64 %ZMM6,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| MOV 0x718(%RDI),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VMOVDQA64 %ZMM7,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVDQA64 %ZMM9,%ZMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VMOVDQA64 %ZMM8,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | vect (100.0%) |
| VPTERNLOGQ $-0x14,0x700(%RDI),%XMM1,%XMM0 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| AND $-0x80000000,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VPSRLQ $0x1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| VPANDQ %XMM9,%XMM0,%XMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
| AND $0x7fffffff,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPMULLQ %XMM8,%XMM0,%XMM0 | 3 | 1.50 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 | vect (25.0%) |
| OR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RAX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $0x1,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| NEG %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| XOR 0x1378(%RDI),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $-0x66f74f21,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| LEA 0x1358(%RDI),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VPTERNLOGQ $-0x6a,0x1368(%RDI),%XMM0,%XMM1 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (25.0%) |
| MOV %RAX,0x710(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA 0x718(%RDI),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVDQU %XMM1,0x700(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | vect (25.0%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VPANDQ 0x1360(%RDI),%YMM7,%YMM0 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| MOV 0x1378(%RDI),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOVQ $0,0x1380(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 2 | 1 | scal (6.3%) |
| MOV (%RDI),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VPTERNLOGQ $-0x14,0x1358(%RDI),%YMM0,%YMM6 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| AND $-0x80000000,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| AND $0x7fffffff,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPSRLQ $0x1,%YMM6,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| VPANDQ %YMM9,%YMM6,%YMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (50.0%) |
| OR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| VPMULLQ %YMM8,%YMM0,%YMM0 | 3 | 1.50 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 | vect (50.0%) |
| MOV %RAX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| AND $0x1,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| NEG %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| XOR 0xc60(%RDI),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $-0x66f74f21,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RAX,0x1378(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VPTERNLOGQ $-0x6a,0xc40(%RDI),%YMM0,%YMM1 | 2 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 | vect (50.0%) |
| VMOVDQU %YMM1,0x1358(%RDI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 | vect (50.0%) |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼std::mersenne_twister_engine | 0.18 | 0.05 |
| ○Loop 42 - random.tcc:412-417 - attention-gcc-skl512 | 0.14 | 0.04 |
| ○Loop 41 - random.tcc:404-409 - attention-gcc-skl512 | 0.04 | 0.01 |
