| Loop Id: 79 | Module: attention-armclang-native | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.43% |
|---|
| Loop Id: 79 | Module: attention-armclang-native | Source: attention_v2.cpp:163-163 [...] | Coverage: 0.43% |
|---|
0x11920 MOVI D0, #0 |
0x11924 FMOV S1, #1.0000000 |
0x11928 ORR X8, XZR, X18 |
0x1192c B 119c8 |
(80) 0x11940 LDR X9, [SP, #5664] |
(80) 0x11944 LDR X10, [SP, #680] |
(80) 0x11948 LDR X11, [SP, #3848] |
(80) 0x1194c ORR X21, XZR, XZR |
(80) 0x11950 AND X9, X9, #0x0 |
(80) 0x11954 AND X12, X10, #0x0 |
(80) 0x11958 SBFM X10, X10, #0, #0 |
(80) 0x1195c ORR X9, X12, X9 |
(80) 0x11960 AND X10, X10, X27 |
(80) 0x11964 EOR X9, X11, X9,LSR #1 |
(80) 0x11968 EOR X9, X9, X10 |
(80) 0x1196c STR X9, [SP, #5664] |
(80) 0x11970 ORR X9, XZR, X21 |
(80) 0x11974 ADD X21, X21, #1 |
(80) 0x11978 ADD X10, SP, #680 |
(80) 0x1197c SUBS X8, X8, #1 |
(80) 0x11980 STR X21, [SP, #5672] |
(80) 0x11984 LDR X9, [X10, X9,LSL #3] |
(80) 0x11988 UBFM X10, X9, #11, #42 |
(80) 0x1198c EOR X9, X10, X9 |
(80) 0x11990 MOVZ W10, #22144 |
(80) 0x11994 MOVK W10, #40236 |
(80) 0x11998 AND X10, X10, X9,LSL #7 |
(80) 0x1199c EOR X9, X10, X9 |
(80) 0x119a0 MOVZ W10, #61382 |
(80) 0x119a4 AND X10, X10, X9,LSL #15 |
(80) 0x119a8 EOR X9, X10, X9 |
(80) 0x119ac EOR X9, X9, X9,LSR #18 |
(80) 0x119b0 UCVTF S2, X9 |
(80) 0x119b4 MOVZ W9, #20352 |
(80) 0x119b8 FMADD S0, S2, S1, S0 |
(80) 0x119bc FMOV S2, W9 |
(80) 0x119c0 FMUL S1, S1, S2 |
(80) 0x119c4 B.EQ 11bc4 |
(80) 0x119c8 CMP X21, #624 |
(80) 0x119cc B.CC 11970 |
(80) 0x119d0 ADD X15, SP, #680 |
(80) 0x119d4 DUPM Z22.D, #0x80000000 |
(80) 0x119d8 DUPM Z23.D, #0x7ffffffe |
(80) 0x119dc DUP Z24.D, #1 |
(80) 0x119e0 ORR X9, XZR, XZR |
(80) 0x119e4 LD1R {V2.2D}, [X15] |
(81) 0x119e8 ADD X10, X15, X9 |
(81) 0x119ec ADD X9, X9, #64 |
(81) 0x119f0 LDUR Q3, [X10, #8] |
(81) 0x119f4 LDUR Q4, [X10, #24] |
(81) 0x119f8 LDUR Q5, [X10, #40] |
(81) 0x119fc ADD X12, X10, #3192 |
(81) 0x11a00 ADD X11, X10, #3176 |
(81) 0x11a04 ADD X13, X10, #3208 |
(81) 0x11a08 ADD X14, X10, #3224 |
(81) 0x11a0c EXT V6.16B, V2.16B, V3.16B, #8 |
(81) 0x11a10 LDUR Q2, [X10, #56] |
(81) 0x11a14 EXT V7.16B, V3.16B, V4.16B, #8 |
(81) 0x11a18 AND V19.16B, V4.16B, V23.16B |
(81) 0x11a1c EXT V16.16B, V4.16B, V5.16B, #8 |
(81) 0x11a20 AND V18.16B, V3.16B, V23.16B |
(81) 0x11a24 AND V20.16B, V5.16B, V23.16B |
(81) 0x11a28 AND V3.16B, V3.16B, V24.16B |
(81) 0x11a2c AND V4.16B, V4.16B, V24.16B |
(81) 0x11a30 AND V7.16B, V7.16B, V22.16B |
(81) 0x11a34 AND V6.16B, V6.16B, V22.16B |
(81) 0x11a38 AND V16.16B, V16.16B, V22.16B |
(81) 0x11a3c CMEQ V3.2D, V3.2D, #0 |
(81) 0x11a40 CMEQ V4.2D, V4.2D, #0 |
(81) 0x11a44 ORR V7.16B, V19.16B, V7.16B |
(81) 0x11a48 LDR Q19, [X12] |
(81) 0x11a4c ORR V6.16B, V18.16B, V6.16B |
(81) 0x11a50 LDR Q18, [X11] |
(81) 0x11a54 ORR V16.16B, V20.16B, V16.16B |
(81) 0x11a58 LDR Q20, [X13] |
(81) 0x11a5c EXT V17.16B, V5.16B, V2.16B, #8 |
(81) 0x11a60 AND V21.16B, V2.16B, V23.16B |
(81) 0x11a64 USHR V7.2D, V7.2D, #1 |
(81) 0x11a68 USHR V6.2D, V6.2D, #1 |
(81) 0x11a6c USHR V16.2D, V16.2D, #1 |
(81) 0x11a70 AND V5.16B, V5.16B, V24.16B |
(81) 0x11a74 AND V17.16B, V17.16B, V22.16B |
(81) 0x11a78 CMEQ V5.2D, V5.2D, #0 |
(81) 0x11a7c ORR V17.16B, V21.16B, V17.16B |
(81) 0x11a80 LDR Q21, [X14] |
(81) 0x11a84 EOR V7.16B, V7.16B, V19.16B |
(81) 0x11a88 DUP V19.2D, X27 |
(81) 0x11a8c EOR V6.16B, V6.16B, V18.16B |
(81) 0x11a90 AND V18.16B, V2.16B, V24.16B |
(81) 0x11a94 EOR V16.16B, V16.16B, V20.16B |
(81) 0x11a98 USHR V17.2D, V17.2D, #1 |
(81) 0x11a9c BCAX V3.16B, V6.16B, V19.16B, V3.16B |
(81) 0x11aa0 CMEQ V6.2D, V18.2D, #0 |
(81) 0x11aa4 BCAX V4.16B, V7.16B, V19.16B, V4.16B |
(81) 0x11aa8 BCAX V5.16B, V16.16B, V19.16B, V5.16B |
(81) 0x11aac STP Q3, Q4, [X10] |
(81) 0x11ab0 EOR V17.16B, V17.16B, V21.16B |
(81) 0x11ab4 BCAX V6.16B, V17.16B, V19.16B, V6.16B |
(81) 0x11ab8 STP Q5, Q6, [X10, #32] |
(81) 0x11abc CMP X9, #1792 |
(81) 0x11ac0 B.NE 119e8 |
(80) 0x11ac4 LDR X11, [SP, #2480] |
(80) 0x11ac8 MOV X10, V2.D[1] |
(80) 0x11acc ORR X9, XZR, XZR |
(80) 0x11ad0 AND X10, X10, #0x0 |
(80) 0x11ad4 AND X12, X11, #0x0 |
(80) 0x11ad8 ORR X10, X12, X10 |
(80) 0x11adc LDR X12, [SP, #5648] |
(80) 0x11ae0 EOR X10, X12, X10,LSR #1 |
(80) 0x11ae4 SBFM X12, X11, #0, #0 |
(80) 0x11ae8 AND X12, X12, X27 |
(80) 0x11aec EOR X10, X10, X12 |
(80) 0x11af0 STR X10, [SP, #2472] |
(80) 0x11af4 AND X10, X11, #0x0 |
(80) 0x11af8 LDR X11, [SP, #2488] |
(80) 0x11afc AND X12, X11, #0x0 |
(80) 0x11b00 ORR X10, X12, X10 |
(80) 0x11b04 LDR X12, [SP, #5656] |
(80) 0x11b08 EOR X10, X12, X10,LSR #1 |
(80) 0x11b0c SBFM X12, X11, #0, #0 |
(80) 0x11b10 AND X12, X12, X27 |
(80) 0x11b14 EOR X10, X10, X12 |
(80) 0x11b18 STR X10, [SP, #2480] |
(80) 0x11b1c AND X10, X11, #0x0 |
(80) 0x11b20 LDR X11, [SP, #2496] |
(80) 0x11b24 AND X12, X11, #0x0 |
(80) 0x11b28 DUP V2.2D, X11 |
(80) 0x11b2c ORR X10, X12, X10 |
(80) 0x11b30 LDR X12, [SP, #5664] |
(80) 0x11b34 EOR X10, X12, X10,LSR #1 |
(80) 0x11b38 SBFM X12, X11, #0, #0 |
(80) 0x11b3c AND X12, X12, X27 |
(80) 0x11b40 EOR X10, X10, X12 |
(80) 0x11b44 STR X10, [SP, #2488] |
(82) 0x11b48 ADD X10, X15, X9 |
(82) 0x11b4c DUP V6.2D, X27 |
(82) 0x11b50 ADD X9, X9, #32 |
(82) 0x11b54 LDR Q3, [X10, #1824] |
(82) 0x11b58 ADD X11, X10, #1816 |
(82) 0x11b5c EXT V2.16B, V2.16B, V3.16B, #8 |
(82) 0x11b60 AND V4.16B, V3.16B, V23.16B |
(82) 0x11b64 AND V2.16B, V2.16B, V22.16B |
(82) 0x11b68 ORR V2.16B, V4.16B, V2.16B |
(82) 0x11b6c LDP Q4, Q5, [X10] |
(82) 0x11b70 USHR V2.2D, V2.2D, #1 |
(82) 0x11b74 EOR V2.16B, V2.16B, V4.16B |
(82) 0x11b78 AND V4.16B, V3.16B, V24.16B |
(82) 0x11b7c CMEQ V4.2D, V4.2D, #0 |
(82) 0x11b80 BCAX V2.16B, V2.16B, V6.16B, V4.16B |
(82) 0x11b84 STR Q2, [X11] |
(82) 0x11b88 LDR Q2, [X10, #1840] |
(82) 0x11b8c ADD X10, X10, #1832 |
(82) 0x11b90 EXT V3.16B, V3.16B, V2.16B, #8 |
(82) 0x11b94 AND V4.16B, V2.16B, V23.16B |
(82) 0x11b98 AND V3.16B, V3.16B, V22.16B |
(82) 0x11b9c ORR V3.16B, V4.16B, V3.16B |
(82) 0x11ba0 AND V4.16B, V2.16B, V24.16B |
(82) 0x11ba4 USHR V3.2D, V3.2D, #1 |
(82) 0x11ba8 CMEQ V4.2D, V4.2D, #0 |
(82) 0x11bac EOR V3.16B, V3.16B, V5.16B |
(82) 0x11bb0 BCAX V3.16B, V3.16B, V6.16B, V4.16B |
(82) 0x11bb4 STR Q3, [X10] |
(82) 0x11bb8 CMP X9, #3168 |
(82) 0x11bbc B.NE 11b48 |
(80) 0x11bc0 B 11940 |
0x11bc4 FDIV S0, S0, S1 |
0x11bc8 FCMP S0, S10 |
0x11bcc B.GE 11be8 |
0x11bd0 LDR X8, [SP, #352] |
0x11bd4 STR S0, [X19, X16,LSL #2] |
0x11bd8 ADD X16, X16, #1 |
0x11bdc CMP X16, X8 |
0x11be0 B.NE 11920 |
0x11be8 FMOV S0, #1.0000000 |
0x11bec MOVI D1, #0 |
0x11bf0 STR X17, [SP, #624] |
0x11bf4 STR X16, [SP, #616] |
0x11bf8 STR X18, [SP, #592] |
0x11bfc BL 10140 |
0x11c00 LDR X18, [SP, #592] |
0x11c04 LDR X16, [SP, #616] |
0x11c08 LDR X17, [SP, #624] |
0x11c0c LDR X8, [SP, #352] |
0x11c10 STR S0, [X19, X16,LSL #2] |
0x11c14 ADD X16, X16, #1 |
0x11c18 CMP X16, X8 |
0x11c1c B.NE 11920 |
/usr/lib/gcc/aarch64-amazon-linux/14/../../../../include/c++/14/cmath: 2622 - 2622 |
-------------------------------------------------------------------------------- |
2622: { return __builtin_nextafterf(__x, __y); } |
/home/eoseret/llm-attention/attention_v2.cpp: 163 - 163 |
-------------------------------------------------------------------------------- |
163: for (size_t i = 0; i < elemsX; ++i) h_X[i] = dist(rng); |
/usr/lib/gcc/aarch64-amazon-linux/14/../../../../include/c++/14/bits/random.tcc: 404 - 3371 |
-------------------------------------------------------------------------------- |
404: for (size_t __k = 0; __k < (__n - __m); ++__k) |
405: { |
406: _UIntType __y = ((_M_x[__k] & __upper_mask) |
407: | (_M_x[__k + 1] & __lower_mask)); |
408: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
409: ^ ((__y & 0x01) ? __a : 0)); |
410: } |
411: |
412: for (size_t __k = (__n - __m); __k < (__n - 1); ++__k) |
413: { |
414: _UIntType __y = ((_M_x[__k] & __upper_mask) |
415: | (_M_x[__k + 1] & __lower_mask)); |
416: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
417: ^ ((__y & 0x01) ? __a : 0)); |
418: } |
419: |
420: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
421: | (_M_x[0] & __lower_mask)); |
422: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
423: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
458: if (_M_p >= state_size) |
459: _M_gen_rand(); |
460: |
461: // Calculate o(x(i)). |
462: result_type __z = _M_x[_M_p++]; |
463: __z ^= (__z >> __u) & __d; |
464: __z ^= (__z << __s) & __b; |
465: __z ^= (__z << __t) & __c; |
466: __z ^= (__z >> __l); |
[...] |
3365: for (size_t __k = __m; __k != 0; --__k) |
3366: { |
3367: __sum += _RealType(__urng() - __urng.min()) * __tmp; |
3368: __tmp *= __r; |
3369: } |
3370: __ret = __sum / __tmp; |
3371: if (__builtin_expect(__ret >= _RealType(1), 0)) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-armclang-native |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.10 |
| CQA speedup if FP arith vectorized | 1.35 |
| CQA speedup if fully vectorized | 2.80 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.05 |
| Bottlenecks | P12, |
| Function | main |
| Source | cmath:2622-2622,attention_v2.cpp:163-163,random.tcc:3370-3371 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.50 |
| CQA cycles if no scalar integer | 1.67 |
| CQA cycles if FP arith vectorized | 2.58 |
| CQA cycles if fully vectorized | 1.25 |
| Front-end cycles | 3.25 |
| P0 cycles | 2.50 |
| P1 cycles | 2.50 |
| P2 cycles | 1.08 |
| P3 cycles | 1.08 |
| P4 cycles | 1.00 |
| P5 cycles | 1.00 |
| P6 cycles | 0.92 |
| P7 cycles | 0.92 |
| P8 cycles | 2.00 |
| P9 cycles | 2.00 |
| P10 cycles | 2.00 |
| P11 cycles | 2.00 |
| P12 cycles | 3.50 |
| P13 cycles | 3.17 |
| P14 cycles | 3.33 |
| P15 cycles | 1.50 |
| P16 cycles | 1.50 |
| DIV/SQRT cycles | 0.87 - 1.13 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 26.00 |
| Nb uops | 26.00 |
| Nb loads | NA |
| Nb stores | 5.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 0.29 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 1.71 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 6.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 39.29 |
| Vector-efficiency ratio load | 50.00 |
| Vector-efficiency ratio store | 40.00 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 25.00 |
| Vector-efficiency ratio other | 37.50 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.10 |
| CQA speedup if FP arith vectorized | 1.35 |
| CQA speedup if fully vectorized | 2.80 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.05 |
| Bottlenecks | P12, |
| Function | main |
| Source | cmath:2622-2622,attention_v2.cpp:163-163,random.tcc:3370-3371 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.50 |
| CQA cycles if no scalar integer | 1.67 |
| CQA cycles if FP arith vectorized | 2.58 |
| CQA cycles if fully vectorized | 1.25 |
| Front-end cycles | 3.25 |
| P0 cycles | 2.50 |
| P1 cycles | 2.50 |
| P2 cycles | 1.08 |
| P3 cycles | 1.08 |
| P4 cycles | 1.00 |
| P5 cycles | 1.00 |
| P6 cycles | 0.92 |
| P7 cycles | 0.92 |
| P8 cycles | 2.00 |
| P9 cycles | 2.00 |
| P10 cycles | 2.00 |
| P11 cycles | 2.00 |
| P12 cycles | 3.50 |
| P13 cycles | 3.17 |
| P14 cycles | 3.33 |
| P15 cycles | 1.50 |
| P16 cycles | 1.50 |
| DIV/SQRT cycles | 0.87 - 1.13 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 26.00 |
| Nb uops | 26.00 |
| Nb loads | NA |
| Nb stores | 5.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 0.29 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 1.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 1.71 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 6.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 39.29 |
| Vector-efficiency ratio load | 50.00 |
| Vector-efficiency ratio store | 40.00 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 25.00 |
| Vector-efficiency ratio other | 37.50 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-armclang-native |
| nb instructions | 26 |
| nb uops | 26 |
| loop length | 104 |
| used w registers | 0 |
| used x registers | 6 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 4 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 8 |
| micro-operation queue | 3.25 cycles |
| front end | 3.25 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 1.08 | 1.08 | 1.00 | 1.00 | 0.92 | 0.92 | 2.00 | 2.00 | 2.00 | 2.00 | 3.50 | 3.17 | 3.33 | 1.50 | 1.50 |
| cycles | 2.50 | 2.50 | 1.08 | 1.08 | 1.00 | 1.00 | 0.92 | 0.92 | 2.00 | 2.00 | 2.00 | 2.00 | 3.50 | 3.17 | 3.33 | 1.50 | 1.50 |
| Cycles executing div or sqrt instructions | 0.87-1.13 |
| Front-end | 3.25 |
| Dispatch | 3.50 |
| DIV/SQRT | 0.87-1.13 |
| Overall L1 | 3.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 0% |
| all | 45% |
| load | 50% |
| store | 40% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 50% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 25% |
| other | 25% |
| all | 39% |
| load | 50% |
| store | 40% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 25% |
| other | 37% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOVI D0, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (50.0%) |
| FMOV S1, #1.0000000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ORR X8, XZR, X18 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| B 119c8 <main+0x7a8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| FDIV S0, S0, S1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 7-10 | 0.87-1.13 | scal (25.0%) |
| FCMP S0, S10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| B.GE 11be8 <main+0x9c8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [SP, #352] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| STR S0, [X19, X16,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| ADD X16, X16, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP X16, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (50.0%) |
| B.NE 11920 <main+0x700> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| FMOV S0, #1.0000000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| MOVI D1, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (50.0%) |
| STR X17, [SP, #624] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X16, [SP, #616] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X18, [SP, #592] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| BL 10140 <@plt_start@+0x120> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X18, [SP, #592] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (50.0%) |
| LDR X16, [SP, #616] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X17, [SP, #624] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (50.0%) |
| LDR X8, [SP, #352] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| STR S0, [X19, X16,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| ADD X16, X16, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP X16, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.NE 11920 <main+0x700> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:163-163 |
| Module | attention-armclang-native |
| nb instructions | 26 |
| nb uops | 26 |
| loop length | 104 |
| used w registers | 0 |
| used x registers | 6 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 4 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 8 |
| micro-operation queue | 3.25 cycles |
| front end | 3.25 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 1.08 | 1.08 | 1.00 | 1.00 | 0.92 | 0.92 | 2.00 | 2.00 | 2.00 | 2.00 | 3.50 | 3.17 | 3.33 | 1.50 | 1.50 |
| cycles | 2.50 | 2.50 | 1.08 | 1.08 | 1.00 | 1.00 | 0.92 | 0.92 | 2.00 | 2.00 | 2.00 | 2.00 | 3.50 | 3.17 | 3.33 | 1.50 | 1.50 |
| Cycles executing div or sqrt instructions | 0.87-1.13 |
| Front-end | 3.25 |
| Dispatch | 3.50 |
| DIV/SQRT | 0.87-1.13 |
| Overall L1 | 3.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 0% |
| all | 45% |
| load | 50% |
| store | 40% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 50% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 25% |
| other | 25% |
| all | 39% |
| load | 50% |
| store | 40% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 25% |
| other | 37% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOVI D0, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (50.0%) |
| FMOV S1, #1.0000000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ORR X8, XZR, X18 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| B 119c8 <main+0x7a8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| FDIV S0, S0, S1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 7-10 | 0.87-1.13 | scal (25.0%) |
| FCMP S0, S10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| B.GE 11be8 <main+0x9c8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [SP, #352] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| STR S0, [X19, X16,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| ADD X16, X16, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP X16, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (50.0%) |
| B.NE 11920 <main+0x700> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| FMOV S0, #1.0000000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| MOVI D1, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (50.0%) |
| STR X17, [SP, #624] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X16, [SP, #616] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X18, [SP, #592] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| BL 10140 <@plt_start@+0x120> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X18, [SP, #592] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (50.0%) |
| LDR X16, [SP, #616] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X17, [SP, #624] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (50.0%) |
| LDR X8, [SP, #352] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| STR S0, [X19, X16,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| ADD X16, X16, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP X16, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.NE 11920 <main+0x700> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
