| Loop Id: 79 | Module: attention-aocc-znver5-512 | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.25% |
|---|
| Loop Id: 79 | Module: attention-aocc-znver5-512 | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.25% |
|---|
0x5180 VMOVSS -0x42e4(%RIP),%XMM1 |
0x5188 VXORPS %XMM0,%XMM0,%XMM0 |
0x518c MOV %RAX,%R9 |
0x518f JMP 5205 |
(80) 0x51a0 MOV %R12,%RCX |
(80) 0x51a3 INC %R12 |
(80) 0x51a6 MOV $0x200b,%EDX |
(80) 0x51ab MOV %R12,0x18f8(%RSP) |
(80) 0x51b3 MOV 0x578(%RSP,%RCX,8),%RCX |
(80) 0x51bb BEXTR %RDX,%RCX,%RDX |
(80) 0x51c0 XOR %RCX,%RDX |
(80) 0x51c3 MOV %EDX,%ECX |
(80) 0x51c5 SAL $0x7,%ECX |
(80) 0x51c8 AND $-0x62d3a980,%ECX |
(80) 0x51ce XOR %RDX,%RCX |
(80) 0x51d1 MOV %ECX,%EDX |
(80) 0x51d3 SAL $0xf,%EDX |
(80) 0x51d6 AND $-0x103a0000,%EDX |
(80) 0x51dc XOR %RCX,%RDX |
(80) 0x51df MOV %RDX,%RCX |
(80) 0x51e2 SHR $0x12,%RCX |
(80) 0x51e6 XOR %RDX,%RCX |
(80) 0x51e9 DEC %R9 |
(80) 0x51ec VCVTUSI2SS %RCX,%XMM5,%XMM2 |
(80) 0x51f2 VFMADD231SS %XMM2,%XMM1,%XMM0 |
(80) 0x51f7 VMULSS -0x435f(%RIP),%XMM1,%XMM1 |
(80) 0x51ff JE 5410 |
(80) 0x5205 CMP $0x270,%R12 |
(80) 0x520c JB 51a0 |
(80) 0x520e VPBROADCASTQ %RSI,%ZMM2 |
(80) 0x5214 XOR %ECX,%ECX |
(80) 0x5216 NOPW %CS:(%RAX,%RAX,1) |
(81) 0x5220 VMOVDQA64 %ZMM2,%ZMM3 |
(81) 0x5226 VMOVDQU64 0x580(%RSP,%RCX,8),%ZMM2 |
(81) 0x522e VPANDQ -0x4378(%RIP){1to8},%ZMM2,%ZMM4 |
(81) 0x5238 VPTESTMQ -0x4362(%RIP){1to8},%ZMM2,%K1 |
(81) 0x5242 VALIGNQ $0x7,%ZMM3,%ZMM2,%ZMM3 |
(81) 0x5249 VPTERNLOGQ $-0x8,-0x436c(%RIP){1to8},%ZMM3,%ZMM4 |
(81) 0x5254 VPSRLQ $0x1,%ZMM4,%ZMM3 |
(81) 0x525b VPXORQ 0x11e0(%RSP,%RCX,8),%ZMM3,%ZMM3 |
(81) 0x5266 VPXORQ -0x4380(%RIP){1to8},%ZMM3,%ZMM3{%K1} |
(81) 0x5270 VMOVDQU64 %ZMM3,0x578(%RSP,%RCX,8) |
(81) 0x527b ADD $0x8,%RCX |
(81) 0x527f CMP $0xe0,%RCX |
(81) 0x5286 JNE 5220 |
(80) 0x5288 MOV 0xc80(%RSP),%RDX |
(80) 0x5290 VEXTRACTI32X4 $0x3,%ZMM2,%XMM2 |
(80) 0x5297 MOV 0xc88(%RSP),%RCX |
(80) 0x529f MOV $-0x66f74f21,%R8D |
(80) 0x52a5 VPEXTRQ $0x1,%XMM2,%RSI |
(80) 0x52ab AND $-0x80000000,%RSI |
(80) 0x52b2 MOV %EDX,%EDI |
(80) 0x52b4 AND $0x7ffffffe,%EDI |
(80) 0x52ba OR %RSI,%RDI |
(80) 0x52bd MOV %EDX,%ESI |
(80) 0x52bf AND $0x1,%ESI |
(80) 0x52c2 AND $-0x80000000,%RDX |
(80) 0x52c9 SHR $0x1,%RDI |
(80) 0x52cc XOR 0x18e0(%RSP),%RDI |
(80) 0x52d4 NEG %ESI |
(80) 0x52d6 AND %R8D,%ESI |
(80) 0x52d9 XOR %RDI,%RSI |
(80) 0x52dc MOV $-0x66f74f21,%EDI |
(80) 0x52e1 MOV %RSI,0xc78(%RSP) |
(80) 0x52e9 MOV %ECX,%ESI |
(80) 0x52eb AND $0x7ffffffe,%ESI |
(80) 0x52f1 OR %RDX,%RSI |
(80) 0x52f4 MOV %ECX,%EDX |
(80) 0x52f6 AND $0x1,%EDX |
(80) 0x52f9 AND $-0x80000000,%RCX |
(80) 0x5300 SHR $0x1,%RSI |
(80) 0x5303 XOR 0x18e8(%RSP),%RSI |
(80) 0x530b NEG %EDX |
(80) 0x530d AND %R8D,%EDX |
(80) 0x5310 XOR %RSI,%RDX |
(80) 0x5313 MOV %RDX,0xc80(%RSP) |
(80) 0x531b MOV 0xc90(%RSP),%RDX |
(80) 0x5323 MOV %EDX,%ESI |
(80) 0x5325 VPBROADCASTQ %RDX,%XMM2 |
(80) 0x532b AND $0x7ffffffe,%EDX |
(80) 0x5331 AND $0x1,%ESI |
(80) 0x5334 OR %RCX,%RDX |
(80) 0x5337 NEG %ESI |
(80) 0x5339 MOV $0xe4,%ECX |
(80) 0x533e SHR $0x1,%RDX |
(80) 0x5341 XOR 0x18f0(%RSP),%RDX |
(80) 0x5349 AND %R8D,%ESI |
(80) 0x534c XOR %RDX,%RSI |
(80) 0x534f MOV %RSI,0xc88(%RSP) |
(80) 0x5357 NOPW (%RAX,%RAX,1) |
(82) 0x5360 VMOVDQU 0x578(%RSP,%RCX,8),%XMM3 |
(82) 0x5369 VPANDQ -0x44b3(%RIP){1to2},%XMM3,%XMM4 |
(82) 0x5373 VPTESTMQ -0x449d(%RIP){1to2},%XMM3,%K1 |
(82) 0x537d VPALIGNR $0x8,%XMM2,%XMM3,%XMM2 |
(82) 0x5383 VPTERNLOGQ $-0x8,-0x44a6(%RIP){1to2},%XMM2,%XMM4 |
(82) 0x538e VPSRLQ $0x1,%XMM4,%XMM2 |
(82) 0x5393 VPXOR -0x1a8(%RSP,%RCX,8),%XMM2,%XMM2 |
(82) 0x539c VPXORQ -0x44b6(%RIP){1to2},%XMM2,%XMM2{%K1} |
(82) 0x53a6 VMOVDQU %XMM2,0x570(%RSP,%RCX,8) |
(82) 0x53af ADD $0x2,%RCX |
(82) 0x53b3 VMOVDQA %XMM3,%XMM2 |
(82) 0x53b7 CMP $0x270,%RCX |
(82) 0x53be JNE 5360 |
(80) 0x53c0 MOV 0x18f0(%RSP),%RCX |
(80) 0x53c8 MOV 0x578(%RSP),%RSI |
(80) 0x53d0 MOV $-0x80000000,%RDX |
(80) 0x53d7 XOR %R12D,%R12D |
(80) 0x53da AND %RDX,%RCX |
(80) 0x53dd MOV %ESI,%EDX |
(80) 0x53df AND $0x7ffffffe,%EDX |
(80) 0x53e5 OR %RCX,%RDX |
(80) 0x53e8 MOV %ESI,%ECX |
(80) 0x53ea AND $0x1,%ECX |
(80) 0x53ed SHR $0x1,%RDX |
(80) 0x53f0 XOR 0x11d8(%RSP),%RDX |
(80) 0x53f8 NEG %ECX |
(80) 0x53fa AND %EDI,%ECX |
(80) 0x53fc XOR %RDX,%RCX |
(80) 0x53ff MOV %RCX,0x18f0(%RSP) |
(80) 0x5407 JMP 51a0 |
0x5410 VDIVSS %XMM1,%XMM0,%XMM0 |
0x5414 VUCOMISS -0x4578(%RIP),%XMM0 |
0x541c JAE 5437 |
0x541e VMOVSS %XMM0,(%RBX,%R10,4) |
0x5424 INC %R10 |
0x5427 CMP 0x88(%RSP),%R10 |
0x542f JNE 5180 |
0x5437 VMOVSS -0x459b(%RIP),%XMM0 |
0x543f VXORPS %XMM1,%XMM1,%XMM1 |
0x5443 MOV %RSI,0x18(%RSP) |
0x5448 MOV %RAX,0x240(%RSP) |
0x5450 MOV %R10,0x1c0(%RSP) |
0x5458 VZEROUPPER |
0x545b CALL 8900 <@plt_start@+0xf0> |
0x5460 MOV 0x1c0(%RSP),%R10 |
0x5468 MOV 0x240(%RSP),%RAX |
0x5470 MOV 0x18(%RSP),%RSI |
0x5475 JMP 541e |
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/cmath: 1661 - 1661 |
-------------------------------------------------------------------------------- |
1661: { return __builtin_nextafterf(__x, __y); } |
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/random.tcc: 401 - 3370 |
-------------------------------------------------------------------------------- |
401: for (size_t __k = 0; __k < (__n - __m); ++__k) |
402: { |
403: _UIntType __y = ((_M_x[__k] & __upper_mask) |
404: | (_M_x[__k + 1] & __lower_mask)); |
405: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
406: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
412: | (_M_x[__k + 1] & __lower_mask)); |
413: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
414: ^ ((__y & 0x01) ? __a : 0)); |
415: } |
416: |
417: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
418: | (_M_x[0] & __lower_mask)); |
419: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
420: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
455: if (_M_p >= state_size) |
456: _M_gen_rand(); |
457: |
458: // Calculate o(x(i)). |
459: result_type __z = _M_x[_M_p++]; |
460: __z ^= (__z >> __u) & __d; |
461: __z ^= (__z << __s) & __b; |
462: __z ^= (__z << __t) & __c; |
463: __z ^= (__z >> __l); |
[...] |
3364: for (size_t __k = __m; __k != 0; --__k) |
3365: { |
3366: __sum += _RealType(__urng() - __urng.min()) * __tmp; |
3367: __tmp *= __r; |
3368: } |
3369: __ret = __sum / __tmp; |
3370: if (__builtin_expect(__ret >= _RealType(1), 0)) |
/home/eoseret/llm-attention/attention_v2.cpp: 163 - 163 |
-------------------------------------------------------------------------------- |
163: for (size_t i = 0; i < elemsX; ++i) h_X[i] = dist(rng); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.10 |
| CQA speedup if FP arith vectorized | 1.68 |
| CQA speedup if fully vectorized | 13.35 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.33 |
| Bottlenecks | |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.75 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 1.64 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.19 |
| P0 cycles | 0.50 |
| P1 cycles | 0.50 |
| P2 cycles | 0.50 |
| P3 cycles | 1.17 |
| P4 cycles | 1.17 |
| P5 cycles | 1.17 |
| P6 cycles | 1.88 |
| P7 cycles | 1.88 |
| P8 cycles | 1.88 |
| P9 cycles | 1.88 |
| P10 cycles | 0.50 |
| P11 cycles | 0.50 |
| P12 cycles | 0.50 |
| P13 cycles | 0.50 |
| P14 cycles | 1.00 |
| P15 cycles | 1.00 |
| DIV/SQRT cycles | 2.50 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 16.50 |
| Nb uops | 17.50 |
| Nb loads | 5.00 |
| Nb stores | 2.50 |
| Nb stack references | 2.50 |
| FLOP/cycle | 0.36 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 16.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 30.00 |
| Bytes stored | 16.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 19.05 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 46.67 |
| Vector-efficiency ratio all | 11.68 |
| Vector-efficiency ratio load | 7.50 |
| Vector-efficiency ratio store | 8.59 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 16.67 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 2.50 |
| CQA speedup if fully vectorized | 16.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.82 |
| Bottlenecks | P10, P11, |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.50 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 1.00 |
| CQA cycles if fully vectorized | 0.16 |
| Front-end cycles | 1.38 |
| P0 cycles | 0.33 |
| P1 cycles | 0.33 |
| P2 cycles | 0.33 |
| P3 cycles | 1.00 |
| P4 cycles | 1.00 |
| P5 cycles | 1.00 |
| P6 cycles | 1.00 |
| P7 cycles | 1.00 |
| P8 cycles | 1.00 |
| P9 cycles | 1.00 |
| P10 cycles | 0.50 |
| P11 cycles | 0.50 |
| P12 cycles | 0.50 |
| P13 cycles | 0.50 |
| P14 cycles | 1.00 |
| P15 cycles | 1.00 |
| DIV/SQRT cycles | 2.50 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 11.00 |
| Nb uops | 11.00 |
| Nb loads | 3.00 |
| Nb stores | 1.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 0.40 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 16.00 |
| Bytes stored | 4.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 16.67 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 33.33 |
| Vector-efficiency ratio all | 10.42 |
| Vector-efficiency ratio load | 6.25 |
| Vector-efficiency ratio store | 6.25 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 14.58 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.20 |
| CQA speedup if FP arith vectorized | 1.32 |
| CQA speedup if fully vectorized | 11.73 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.09 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:163-163 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.00 |
| CQA cycles if no scalar integer | 2.50 |
| CQA cycles if FP arith vectorized | 2.28 |
| CQA cycles if fully vectorized | 0.26 |
| Front-end cycles | 3.00 |
| P0 cycles | 0.67 |
| P1 cycles | 0.67 |
| P2 cycles | 0.67 |
| P3 cycles | 1.33 |
| P4 cycles | 1.33 |
| P5 cycles | 1.33 |
| P6 cycles | 2.75 |
| P7 cycles | 2.75 |
| P8 cycles | 2.75 |
| P9 cycles | 2.75 |
| P10 cycles | 0.50 |
| P11 cycles | 0.50 |
| P12 cycles | 0.50 |
| P13 cycles | 0.50 |
| P14 cycles | 1.00 |
| P15 cycles | 1.00 |
| DIV/SQRT cycles | 2.50 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 22.00 |
| Nb uops | 24.00 |
| Nb loads | 7.00 |
| Nb stores | 4.00 |
| Nb stack references | 4.00 |
| FLOP/cycle | 0.33 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 24.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 44.00 |
| Bytes stored | 28.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 2.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 21.43 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 60.00 |
| Vector-efficiency ratio all | 12.95 |
| Vector-efficiency ratio load | 8.75 |
| Vector-efficiency ratio store | 10.94 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 18.75 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-aocc-znver5-512 |
| nb instructions | 16.50 |
| nb uops | 17.50 |
| loop length | 86 |
| used x86 registers | 5.50 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 2.50 |
| micro-operation queue | 2.19 cycles |
| front end | 2.19 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.50 | 0.50 | 0.50 | 1.17 | 1.17 | 1.17 | 1.88 | 1.88 | 1.88 | 1.88 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| cycles | 0.50 | 0.50 | 0.50 | 1.17 | 1.17 | 1.17 | 1.88 | 1.88 | 1.88 | 1.88 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| Cycles executing div or sqrt instructions | 2.50 |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 2.19 |
| Dispatch | 1.88 |
| DIV/SQRT | 2.50 |
| Data deps. | 1.00 |
| Overall L1 | 2.75 |
| all | 7% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 25% |
| all | 24% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 58% |
| all | 19% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 46% |
| all | 13% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 15% |
| all | 10% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 17% |
| all | 11% |
| load | 7% |
| store | 8% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 16% |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-aocc-znver5-512 |
| nb instructions | 11 |
| nb uops | 11 |
| loop length | 54 |
| used x86 registers | 5 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 1 |
| micro-operation queue | 1.38 cycles |
| front end | 1.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.33 | 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| cycles | 0.33 | 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| Cycles executing div or sqrt instructions | 2.50 |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 1.38 |
| Dispatch | 1.00 |
| DIV/SQRT | 2.50 |
| Data deps. | 1.00 |
| Overall L1 | 2.50 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 20% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 50% |
| all | 16% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 33% |
| all | 12% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 10% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 15% |
| all | 10% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 14% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVSS -0x42e4(%RIP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %RAX,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | scal (12.5%) |
| JMP 5205 <main+0x8a5> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VUCOMISS -0x4578(%RIP),%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 5437 <main+0xad7> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VMOVSS %XMM0,(%RBX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| INC %R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP 0x88(%RSP),%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| JNE 5180 <main+0x820> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-aocc-znver5-512 |
| nb instructions | 22 |
| nb uops | 24 |
| loop length | 118 |
| used x86 registers | 6 |
| used mmx registers | 0 |
| used xmm registers | 2 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 4 |
| micro-operation queue | 3.00 cycles |
| front end | 3.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.67 | 0.67 | 0.67 | 1.33 | 1.33 | 1.33 | 2.75 | 2.75 | 2.75 | 2.75 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| cycles | 0.67 | 0.67 | 0.67 | 1.33 | 1.33 | 1.33 | 2.75 | 2.75 | 2.75 | 2.75 | 0.50 | 0.50 | 0.50 | 0.50 | 1.00 | 1.00 |
| Cycles executing div or sqrt instructions | 2.50 |
| Longest recurrence chain latency (RecMII) | 1.00 |
| Front-end | 3.00 |
| Dispatch | 2.75 |
| DIV/SQRT | 2.50 |
| Data deps. | 1.00 |
| Overall L1 | 3.00 |
| all | 14% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 50% |
| all | 28% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 66% |
| all | 21% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 60% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 18% |
| all | 11% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 18% |
| all | 12% |
| load | 8% |
| store | 10% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 18% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVSS -0x42e4(%RIP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %RAX,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | scal (12.5%) |
| JMP 5205 <main+0x8a5> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VUCOMISS -0x4578(%RIP),%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 5437 <main+0xad7> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VMOVSS %XMM0,(%RBX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| INC %R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP 0x88(%RSP),%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| JNE 5180 <main+0x820> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VMOVSS -0x459b(%RIP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %RSI,0x18(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RAX,0x240(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x1c0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 8900 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| MOV 0x1c0(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0x240(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x18(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 541e <main+0xabe> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
