| Loop Id: 71 | Module: attention-aocc-znver5-256 | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.41% |
|---|
| Loop Id: 71 | Module: attention-aocc-znver5-256 | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.41% |
|---|
0x43e0 VMOVSS -0x3544(%RIP),%XMM1 |
0x43e8 VXORPS %XMM0,%XMM0,%XMM0 |
0x43ec MOV %RAX,%R9 |
0x43ef JMP 44ac |
(72) 0x4400 MOV 0x1710(%RSP),%RCX |
(72) 0x4408 MOV 0x398(%RSP),%RSI |
(72) 0x4410 MOV $-0x80000000,%RDX |
(72) 0x4417 XOR %R12D,%R12D |
(72) 0x441a AND %RDX,%RCX |
(72) 0x441d MOV %ESI,%EDX |
(72) 0x441f AND $0x7ffffffe,%EDX |
(72) 0x4425 OR %RCX,%RDX |
(72) 0x4428 MOV %ESI,%ECX |
(72) 0x442a AND $0x1,%ECX |
(72) 0x442d SHR $0x1,%RDX |
(72) 0x4430 XOR 0xff8(%RSP),%RDX |
(72) 0x4438 NEG %ECX |
(72) 0x443a AND %EDI,%ECX |
(72) 0x443c XOR %RDX,%RCX |
(72) 0x443f MOV %RCX,0x1710(%RSP) |
(72) 0x4447 MOV %R12,%RCX |
(72) 0x444a INC %R12 |
(72) 0x444d MOV $0x200b,%EDX |
(72) 0x4452 MOV %R12,0x1718(%RSP) |
(72) 0x445a MOV 0x398(%RSP,%RCX,8),%RCX |
(72) 0x4462 BEXTR %RDX,%RCX,%RDX |
(72) 0x4467 XOR %RCX,%RDX |
(72) 0x446a MOV %EDX,%ECX |
(72) 0x446c SAL $0x7,%ECX |
(72) 0x446f AND $-0x62d3a980,%ECX |
(72) 0x4475 XOR %RDX,%RCX |
(72) 0x4478 MOV %ECX,%EDX |
(72) 0x447a SAL $0xf,%EDX |
(72) 0x447d AND $-0x103a0000,%EDX |
(72) 0x4483 XOR %RCX,%RDX |
(72) 0x4486 MOV %RDX,%RCX |
(72) 0x4489 SHR $0x12,%RCX |
(72) 0x448d XOR %RDX,%RCX |
(72) 0x4490 DEC %R9 |
(72) 0x4493 VCVTUSI2SS %RCX,%XMM5,%XMM2 |
(72) 0x4499 VFMADD231SS %XMM2,%XMM1,%XMM0 |
(72) 0x449e VMULSS -0x3606(%RIP),%XMM1,%XMM1 |
(72) 0x44a6 JE 4660 |
(72) 0x44ac CMP $0x270,%R12 |
(72) 0x44b3 JB 4447 |
(72) 0x44b5 VPBROADCASTQ %RSI,%YMM2 |
(72) 0x44bb XOR %ECX,%ECX |
(72) 0x44bd NOPL (%RAX) |
(73) 0x44c0 VMOVDQA %YMM2,%YMM3 |
(73) 0x44c4 VMOVDQU 0x3a0(%RSP,%RCX,8),%YMM2 |
(73) 0x44cd VPANDQ -0x3617(%RIP){1to4},%YMM2,%YMM4 |
(73) 0x44d7 VPTESTMQ -0x3601(%RIP){1to4},%YMM2,%K1 |
(73) 0x44e1 VALIGNQ $0x3,%YMM3,%YMM2,%YMM3 |
(73) 0x44e8 VPTERNLOGQ $-0x8,-0x360b(%RIP){1to4},%YMM3,%YMM4 |
(73) 0x44f3 VPSRLQ $0x1,%YMM4,%YMM3 |
(73) 0x44f8 VPXOR 0x1000(%RSP,%RCX,8),%YMM3,%YMM3 |
(73) 0x4501 VPXORQ -0x361b(%RIP){1to4},%YMM3,%YMM3{%K1} |
(73) 0x450b VMOVDQU %YMM3,0x398(%RSP,%RCX,8) |
(73) 0x4514 ADD $0x4,%RCX |
(73) 0x4518 CMP $0xe0,%RCX |
(73) 0x451f JNE 44c0 |
(72) 0x4521 MOV 0xaa0(%RSP),%RDX |
(72) 0x4529 VEXTRACTI128 $0x1,%YMM2,%XMM2 |
(72) 0x452f MOV 0xaa8(%RSP),%RCX |
(72) 0x4537 MOV $-0x66f74f21,%R8D |
(72) 0x453d VPEXTRQ $0x1,%XMM2,%RSI |
(72) 0x4543 AND $-0x80000000,%RSI |
(72) 0x454a MOV %EDX,%EDI |
(72) 0x454c AND $0x7ffffffe,%EDI |
(72) 0x4552 OR %RSI,%RDI |
(72) 0x4555 MOV %EDX,%ESI |
(72) 0x4557 AND $0x1,%ESI |
(72) 0x455a AND $-0x80000000,%RDX |
(72) 0x4561 SHR $0x1,%RDI |
(72) 0x4564 XOR 0x1700(%RSP),%RDI |
(72) 0x456c NEG %ESI |
(72) 0x456e AND %R8D,%ESI |
(72) 0x4571 XOR %RDI,%RSI |
(72) 0x4574 MOV $-0x66f74f21,%EDI |
(72) 0x4579 MOV %RSI,0xa98(%RSP) |
(72) 0x4581 MOV %ECX,%ESI |
(72) 0x4583 AND $0x7ffffffe,%ESI |
(72) 0x4589 OR %RDX,%RSI |
(72) 0x458c MOV %ECX,%EDX |
(72) 0x458e AND $0x1,%EDX |
(72) 0x4591 AND $-0x80000000,%RCX |
(72) 0x4598 SHR $0x1,%RSI |
(72) 0x459b XOR 0x1708(%RSP),%RSI |
(72) 0x45a3 NEG %EDX |
(72) 0x45a5 AND %R8D,%EDX |
(72) 0x45a8 XOR %RSI,%RDX |
(72) 0x45ab MOV %RDX,0xaa0(%RSP) |
(72) 0x45b3 MOV 0xab0(%RSP),%RDX |
(72) 0x45bb MOV %EDX,%ESI |
(72) 0x45bd VPBROADCASTQ %RDX,%XMM2 |
(72) 0x45c3 AND $0x7ffffffe,%EDX |
(72) 0x45c9 AND $0x1,%ESI |
(72) 0x45cc OR %RCX,%RDX |
(72) 0x45cf NEG %ESI |
(72) 0x45d1 MOV $0xe4,%ECX |
(72) 0x45d6 SHR $0x1,%RDX |
(72) 0x45d9 XOR 0x1710(%RSP),%RDX |
(72) 0x45e1 AND %R8D,%ESI |
(72) 0x45e4 XOR %RDX,%RSI |
(72) 0x45e7 MOV %RSI,0xaa8(%RSP) |
(72) 0x45ef NOP |
(74) 0x45f0 VMOVDQU 0x398(%RSP,%RCX,8),%XMM3 |
(74) 0x45f9 VPANDQ -0x3743(%RIP){1to2},%XMM3,%XMM4 |
(74) 0x4603 VPTESTMQ -0x372d(%RIP){1to2},%XMM3,%K1 |
(74) 0x460d VPALIGNR $0x8,%XMM2,%XMM3,%XMM2 |
(74) 0x4613 VPTERNLOGQ $-0x8,-0x3736(%RIP){1to2},%XMM2,%XMM4 |
(74) 0x461e VPSRLQ $0x1,%XMM4,%XMM2 |
(74) 0x4623 VPXOR -0x388(%RSP,%RCX,8),%XMM2,%XMM2 |
(74) 0x462c VPXORQ -0x3746(%RIP){1to2},%XMM2,%XMM2{%K1} |
(74) 0x4636 VMOVDQU %XMM2,0x390(%RSP,%RCX,8) |
(74) 0x463f ADD $0x2,%RCX |
(74) 0x4643 VMOVDQA %XMM3,%XMM2 |
(74) 0x4647 CMP $0x270,%RCX |
(74) 0x464e JNE 45f0 |
(72) 0x4650 JMP 4400 |
0x4660 VDIVSS %XMM1,%XMM0,%XMM0 |
0x4664 VUCOMISS -0x37c8(%RIP),%XMM0 |
0x466c JAE 4684 |
0x466e VMOVSS %XMM0,(%RBX,%R10,4) |
0x4674 INC %R10 |
0x4677 CMP 0x70(%RSP),%R10 |
0x467c JNE 43e0 |
0x4684 VMOVSS -0x37e8(%RIP),%XMM0 |
0x468c VXORPS %XMM1,%XMM1,%XMM1 |
0x4690 MOV %RSI,0x20(%RSP) |
0x4695 MOV %RAX,0xe0(%RSP) |
0x469d MOV %R10,0x120(%RSP) |
0x46a5 VZEROUPPER |
0x46a8 CALL 70a0 <@plt_start@+0xf0> |
0x46ad MOV 0x120(%RSP),%R10 |
0x46b5 MOV 0xe0(%RSP),%RAX |
0x46bd MOV 0x20(%RSP),%RSI |
0x46c2 JMP 466e |
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/cmath: 1661 - 1661 |
-------------------------------------------------------------------------------- |
1661: { return __builtin_nextafterf(__x, __y); } |
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/random.tcc: 401 - 3370 |
-------------------------------------------------------------------------------- |
401: for (size_t __k = 0; __k < (__n - __m); ++__k) |
402: { |
403: _UIntType __y = ((_M_x[__k] & __upper_mask) |
404: | (_M_x[__k + 1] & __lower_mask)); |
405: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
406: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
412: | (_M_x[__k + 1] & __lower_mask)); |
413: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
414: ^ ((__y & 0x01) ? __a : 0)); |
415: } |
416: |
417: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
418: | (_M_x[0] & __lower_mask)); |
419: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
420: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
455: if (_M_p >= state_size) |
456: _M_gen_rand(); |
457: |
458: // Calculate o(x(i)). |
459: result_type __z = _M_x[_M_p++]; |
460: __z ^= (__z >> __u) & __d; |
461: __z ^= (__z << __s) & __b; |
462: __z ^= (__z << __t) & __c; |
463: __z ^= (__z >> __l); |
[...] |
3364: for (size_t __k = __m; __k != 0; --__k) |
3365: { |
3366: __sum += _RealType(__urng() - __urng.min()) * __tmp; |
3367: __tmp *= __r; |
3368: } |
3369: __ret = __sum / __tmp; |
3370: if (__builtin_expect(__ret >= _RealType(1), 0)) |
/home/eoseret/llm-attention/attention_v2.cpp: 163 - 163 |
-------------------------------------------------------------------------------- |
163: for (size_t i = 0; i < elemsX; ++i) h_X[i] = dist(rng); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.10 |
| CQA speedup if FP arith vectorized | 1.68 |
| CQA speedup if fully vectorized | 13.35 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.33 |
| Bottlenecks | |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.75 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 1.64 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.19 |
| P0 cycles | 0.50 |
| P1 cycles | 0.50 |
| P2 cycles | 0.50 |
| P3 cycles | 1.17 |
| P4 cycles | 1.17 |
| P5 cycles | 1.17 |
| P6 cycles | 1.88 |
| P7 cycles | 1.88 |
| P8 cycles | 1.88 |
| P9 cycles | 1.88 |
| P10 cycles | 0.50 |
| P11 cycles | 0.50 |
| P12 cycles | 0.50 |
| P13 cycles | 0.50 |
| P14 cycles | 1.00 |
| P15 cycles | 1.00 |
| DIV/SQRT cycles | 2.50 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 16.50 |
| Nb uops | 17.50 |
| Nb loads | 5.00 |
| Nb stores | 2.50 |
| Nb stack references | 2.50 |
| FLOP/cycle | 0.36 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 16.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 30.00 |
| Bytes stored | 16.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 19.05 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 46.67 |
| Vector-efficiency ratio all | 11.68 |
| Vector-efficiency ratio load | 7.50 |
| Vector-efficiency ratio store | 8.59 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 16.67 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 2.50 |
| CQA speedup if fully vectorized | 16.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.82 |
| Bottlenecks | P10, P11, |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.50 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 1.00 |
| CQA cycles if fully vectorized | 0.16 |
| Front-end cycles | 1.38 |
| P0 cycles | 0.33 |
| P1 cycles | 0.33 |
| P2 cycles | 0.33 |
| P3 cycles | 1.00 |
| P4 cycles | 1.00 |
| P5 cycles | 1.00 |
| P6 cycles | 1.00 |
| P7 cycles | 1.00 |
| P8 cycles | 1.00 |
| P9 cycles | 1.00 |
| P10 cycles | 0.50 |
| P11 cycles | 0.50 |
| P12 cycles | 0.50 |
| P13 cycles | 0.50 |
| P14 cycles | 1.00 |
| P15 cycles | 1.00 |
| DIV/SQRT cycles | 2.50 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 11.00 |
| Nb uops | 11.00 |
| Nb loads | 3.00 |
| Nb stores | 1.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 0.40 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 16.00 |
| Bytes stored | 4.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 16.67 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 33.33 |
| Vector-efficiency ratio all | 10.42 |
| Vector-efficiency ratio load | 6.25 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 14.58 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.20 |
| CQA speedup if FP arith vectorized | 1.32 |
| CQA speedup if fully vectorized | 11.73 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.09 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.00 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 2.28 |
| CQA cycles if fully vectorized | 0.26 |
| Front-end cycles | 3.00 |
| P0 cycles | 0.67 |
| P1 cycles | 0.67 |
| P2 cycles | 0.67 |
| P3 cycles | 1.33 |
| P4 cycles | 1.33 |
| P5 cycles | 1.33 |
| P6 cycles | 2.75 |
| P7 cycles | 2.75 |
| P8 cycles | 2.75 |
| P9 cycles | 2.75 |
| P10 cycles | 0.50 |
| P11 cycles | 0.50 |
| P12 cycles | 0.50 |
| P13 cycles | 0.50 |
| P14 cycles | 1.00 |
| P15 cycles | 1.00 |
| DIV/SQRT cycles | 2.50 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 22.00 |
| Nb uops | 24.00 |
| Nb loads | 7.00 |
| Nb stores | 4.00 |
| Nb stack references | 4.00 |
| FLOP/cycle | 0.33 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 24.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 44.00 |
| Bytes stored | 28.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 21.43 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 60.00 |
| Vector-efficiency ratio all | 12.95 |
| Vector-efficiency ratio load | 8.75 |
| Vector-efficiency ratio store | 10.94 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 18.75 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-aocc-znver5-256 |
| nb instructions | 16.50 |
| nb uops | 17.50 |
| loop length | 86 |
| used x86 registers | 5.50 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 2.50 |
| micro-operation queue | 2.19 cycles |
| front end | 2.19 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.50 | 0.50 | 0.50 | 1.17 | 1.17 | 1.17 | 1.88 | 1.88 | 1.88 | 1.88 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| cycles | 0.50 | 0.50 | 0.50 | 1.17 | 1.17 | 1.17 | 1.88 | 1.88 | 1.88 | 1.88 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| Cycles executing div or sqrt instructions | 2.50 |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 2.19 |
| Dispatch | 1.88 |
| DIV/SQRT | 2.50 |
| Data deps. | 1.00 |
| Overall L1 | 2.75 |
| all | 7% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 25% |
| all | 24% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 58% |
| all | 19% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 46% |
| all | 13% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 15% |
| all | 10% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 17% |
| all | 11% |
| load | 7% |
| store | 8% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 16% |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-aocc-znver5-256 |
| nb instructions | 11 |
| nb uops | 11 |
| loop length | 54 |
| used x86 registers | 5 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 1 |
| micro-operation queue | 1.38 cycles |
| front end | 1.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.33 | 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| cycles | 0.33 | 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| Cycles executing div or sqrt instructions | 2.50 |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 1.38 |
| Dispatch | 1.00 |
| DIV/SQRT | 2.50 |
| Data deps. | 1.00 |
| Overall L1 | 2.50 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 20% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 50% |
| all | 16% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 33% |
| all | 12% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 10% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 15% |
| all | 10% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 14% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVSS -0x3544(%RIP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %RAX,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | scal (12.5%) |
| JMP 44ac <main+0x89c> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VUCOMISS -0x37c8(%RIP),%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4684 <main+0xa74> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VMOVSS %XMM0,(%RBX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| INC %R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP 0x70(%RSP),%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| JNE 43e0 <main+0x7d0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-aocc-znver5-256 |
| nb instructions | 22 |
| nb uops | 24 |
| loop length | 118 |
| used x86 registers | 6 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 4 |
| micro-operation queue | 3.00 cycles |
| front end | 3.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.67 | 0.67 | 0.67 | 1.33 | 1.33 | 1.33 | 2.75 | 2.75 | 2.75 | 2.75 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| cycles | 0.67 | 0.67 | 0.67 | 1.33 | 1.33 | 1.33 | 2.75 | 2.75 | 2.75 | 2.75 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| Cycles executing div or sqrt instructions | 2.50 |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 3.00 |
| Dispatch | 2.75 |
| DIV/SQRT | 2.50 |
| Data deps. | 1.00 |
| Overall L1 | 3.00 |
| all | 14% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 50% |
| all | 28% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 66% |
| all | 21% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 60% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 18% |
| all | 11% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 18% |
| all | 12% |
| load | 8% |
| store | 10% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 18% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVSS -0x3544(%RIP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %RAX,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | scal (12.5%) |
| JMP 44ac <main+0x89c> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VUCOMISS -0x37c8(%RIP),%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4684 <main+0xa74> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VMOVSS %XMM0,(%RBX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| INC %R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP 0x70(%RSP),%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| JNE 43e0 <main+0x7d0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VMOVSS -0x37e8(%RIP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RAX,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 466e <main+0xa5e> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
