| Function: softmax(float const*, float*, float*, int) | Module: attention-gcc-native | Source: attention_v2.cpp:42-63 | Coverage (incl. loops): 0.87% | (excl. loops): 0.00% |
|---|
| Function: softmax(float const*, float*, float*, int) | Module: attention-gcc-native | Source: attention_v2.cpp:42-63 | Coverage (incl. loops): 0.87% | (excl. loops): 0.00% |
|---|
/home/eoseret/llm-attention/attention_v2.cpp: 42 - 63 |
-------------------------------------------------------------------------------- |
42: { |
43: for (int row = 0; row < N; ++row) { |
44: const float *S_row = &S[row * N]; |
45: |
46: float max_val = -FLT_MAX; |
47: for (int idx = 0; idx <= row; ++idx) // vectorised |
48: if (S_row[idx] > max_val) max_val = S_row[idx]; |
49: |
50: float sum = 0.0f; |
51: #pragma clang loop vectorize(enable) |
52: for (int idx = 0; idx <= row; ++idx) // vectorised |
53: sum += expf(S_row[idx] - max_val); |
54: |
55: for (int idx = 0; idx <= row; ++idx) //vectorised |
56: P[row * N + idx] = expf(S_row[idx] - max_val) / sum; |
57: |
58: for (int idx = row + 1; idx < N; ++idx) |
59: P[row * N + idx] = 0.0f; |
60: |
61: D[row] = sum; |
62: } |
63: } |
0x4119e0 STP X29, X30, [SP, #-160]! |
0x4119e4 ADD X29, SP, #0 |
0x4119e8 STR W3, [SP, #132] |
0x4119ec STR X0, [SP, #136] |
0x4119f0 STR X1, [SP, #152] |
0x4119f4 CMP W3, #0 |
0x4119f8 B.LE 411bd8 |
0x4119fc STP X19, X20, [SP, #16] |
0x411a00 ORR X20, XZR, X0 |
0x411a04 MOVZ X19, #1 |
0x411a08 STP X21, X22, [SP, #32] |
0x411a0c ORR X21, XZR, X2 |
0x411a10 ADD X22, X0, #4 |
0x411a14 ADD X2, X2, #4 |
0x411a18 SUB X0, X1, #8 |
0x411a1c ORR W1, WZR, W19 |
0x411a20 STP X25, X26, [SP, #64] |
0x411a24 ADD W26, W3, #1 |
0x411a28 UBFM X26, X26, #62, #61 |
0x411a2c STP X23, X24, [SP, #48] |
0x411a30 MOVZ X24, #0 |
0x411a34 CNTW X23, ALL |
0x411a38 STP X27, X28, [SP, #80] |
0x411a3c SBFM X28, X3, #0, #31 |
0x411a40 SUB X27, X26, #4 |
0x411a44 STP D14, D15, [SP, #96] |
0x411a48 STR X2, [SP, #112] |
0x411a4c STR X0, [SP, #144] |
0x411a50 SUB W0, W19, #1 |
0x411a54 CMP W0, #14 |
0x411a58 B.LS 411ba8 |
0x411a5c HINT #0 |
(33) 0x411a60 MVNI V30.4S, #128 |
(33) 0x411a64 UBFM W2, W19, #4, #31 |
(33) 0x411a68 MOVZ W3, #64 |
(33) 0x411a6c ORR X0, XZR, X20 |
(33) 0x411a70 UMADDL X2, W2, W3, X20 |
(33) 0x411a74 ORR V31.16B, V30.16B, V30.16B |
(33) 0x411a78 ORR V28.16B, V30.16B, V30.16B |
(33) 0x411a7c ORR V29.16B, V30.16B, V30.16B |
(32) 0x411a80 LDP Q24, Q25, [X0] |
(32) 0x411a84 LDP Q26, Q27, [X0, #32] |
(32) 0x411a88 ADD X0, X0, #64 |
(32) 0x411a8c FMAXNM V29.4S, V29.4S, V24.4S |
(32) 0x411a90 FMAXNM V28.4S, V28.4S, V25.4S |
(32) 0x411a94 FMAXNM V31.4S, V31.4S, V26.4S |
(32) 0x411a98 FMAXNM V30.4S, V30.4S, V27.4S |
(32) 0x411a9c CMP X0, X2 |
(32) 0x411aa0 B.NE 411a80 |
(33) 0x411aa4 FMAXNM V15.4S, V31.4S, V30.4S |
(33) 0x411aa8 AND W3, W1, #0xfffffff0 |
(33) 0x411aac FMAXNM V29.4S, V29.4S, V28.4S |
(33) 0x411ab0 FMAXNM V15.4S, V15.4S, V29.4S |
(33) 0x411ab4 FMAXNMV S15, V15.4S |
(33) 0x411ab8 ANDS XZR, X1, #0xf |
(33) 0x411abc B.EQ 411af8 |
(34) 0x411ac0 LDR X4, [SP, #136] |
(34) 0x411ac4 ADD X2, X24, W3,UXTW |
(34) 0x411ac8 MOVZ X0, #0 |
(34) 0x411acc SUB W1, W1, W3 |
(34) 0x411ad0 DUP Z15.S, Z15.S[0] |
(34) 0x411ad4 WHILELO P7.S, WZR, W1 |
(34) 0x411ad8 ADD X2, X4, X2,LSL #2 |
(29) 0x411adc LD1W {Z31.S}, P7/Z, [X2, X0,LSL #2] |
(29) 0x411ae0 ADD X0, X0, X23 |
(29) 0x411ae4 FMAXNM Z15.S, P7/M, Z15.S, Z31.S |
(29) 0x411ae8 WHILELO P7.S, W0, W1 |
(29) 0x411aec B.NE 411adc |
(34) 0x411af0 PTRUE P7.B, ALL |
(34) 0x411af4 FMAXNMV S15, P7, Z15.S |
(33) 0x411af8 MOVI V14.2S, #0 |
(33) 0x411afc ORR X25, XZR, X20 |
(30) 0x411b00 LDR S0, [X25], #4 |
(30) 0x411b04 FSUB S0, S0, S15 |
(30) 0x411b08 BL 410140 |
(30) 0x411b0c FADD S14, S14, S0 |
(30) 0x411b10 CMP X22, X25 |
(30) 0x411b14 B.NE 411b00 |
(33) 0x411b18 MOVZ X25, #0 |
(33) 0x411b1c HINT #0 |
(31) 0x411b20 LDR S0, [X20, X25,LSL #2] |
(31) 0x411b24 FSUB S0, S0, S15 |
(31) 0x411b28 BL 410140 |
(31) 0x411b2c FDIV S0, S0, S14 |
(31) 0x411b30 STR S0, [X21, X25,LSL #2] |
(31) 0x411b34 ADD X25, X25, #1 |
(31) 0x411b38 CMP X19, X25 |
(31) 0x411b3c B.NE 411b20 |
(33) 0x411b40 LDR W0, [SP, #132] |
(33) 0x411b44 ADD X20, X20, X27 |
(33) 0x411b48 ADD X21, X21, X27 |
(33) 0x411b4c ADD X22, X22, X26 |
(33) 0x411b50 ADD X24, X24, X28 |
(33) 0x411b54 ADD X3, X19, #1 |
(33) 0x411b58 CMP X28, X19 |
(33) 0x411b5c B.EQ 411bb4 |
(33) 0x411b60 LDR X19, [SP, #112] |
(33) 0x411b64 SUB W2, W0, W3 |
(33) 0x411b68 MOVZ W1, #0 |
(33) 0x411b6c ADD X2, X2, #1 |
(33) 0x411b70 STR X3, [SP, #120] |
(33) 0x411b74 UBFM X2, X2, #62, #61 |
(33) 0x411b78 ORR X0, XZR, X19 |
(33) 0x411b7c BL 410060 |
(33) 0x411b80 ADD X0, X19, X26 |
(33) 0x411b84 LDR X3, [SP, #120] |
(33) 0x411b88 STR X0, [SP, #112] |
(33) 0x411b8c LDR X0, [SP, #144] |
(33) 0x411b90 ORR X19, XZR, X3 |
(33) 0x411b94 ORR W1, WZR, W19 |
(33) 0x411b98 STR S14, [X0, X3,LSL #2] |
(33) 0x411b9c SUB W0, W19, #1 |
(33) 0x411ba0 CMP W0, #14 |
(33) 0x411ba4 B.HI 411a60 |
(35) 0x411ba8 MVNI V15.2S, #128 |
(35) 0x411bac MOVZ W3, #0 |
(35) 0x411bb0 B 411ac0 |
0x411bb4 LDR X1, [SP, #152] |
0x411bb8 SUB W0, W0, #1 |
0x411bbc STR S14, [X1, X0,LSL #2] |
0x411bc0 LDP D14, D15, [SP, #96] |
0x411bc4 LDP X19, X20, [SP, #16] |
0x411bc8 LDP X21, X22, [SP, #32] |
0x411bcc LDP X23, X24, [SP, #48] |
0x411bd0 LDP X25, X26, [SP, #64] |
0x411bd4 LDP X27, X28, [SP, #80] |
0x411bd8 LDP X29, X30, [SP], #160 |
0x411bdc RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | attention_v2.cpp:283 | attention-gcc-native |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | stl_vector.h:1920 | attention-gcc-native |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_0
| Source file and lines | attention_v2.cpp:42-63 |
| Module | attention-gcc-native |
| nb instructions | 43 |
| nb uops | 42 |
| loop length | 172 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 2 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 20 |
| micro-operation queue | 5.25 cycles |
| front end | 5.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 3.25 | 3.25 | 3.17 | 3.17 | 3.08 | 3.08 | 1.00 | 1.00 | 0.00 | 0.00 | 7.17 | 6.83 | 7.00 | 5.50 | 5.50 |
| cycles | 1.50 | 1.50 | 3.25 | 3.25 | 3.17 | 3.17 | 3.08 | 3.08 | 1.00 | 1.00 | 0.00 | 0.00 | 7.17 | 6.83 | 7.00 | 5.50 | 5.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.25 |
| Dispatch | 7.17 |
| Overall L1 | 7.17 |
| all | 5% |
| load | 14% |
| store | 7% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 68% |
| load | 100% |
| store | 73% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 50% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-160]! | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STR W3, [SP, #132] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR X0, [SP, #136] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| CMP W3, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.LE 411bd8 <_Z7softmaxPKfPfS1_i+0x1f8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ORR X20, XZR, X0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| MOVZ X19, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ORR X21, XZR, X2 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ADD X22, X0, #4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ADD X2, X2, #4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| SUB X0, X1, #8 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| ORR W1, WZR, W19 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD W26, W3, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| UBFM X26, X26, #62, #61 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| MOVZ X24, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| CNTW X23, ALL | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (50.0%) |
| STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| SBFM X28, X3, #0, #31 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (100.0%) |
| SUB X27, X26, #4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| STR X2, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X0, [SP, #144] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| SUB W0, W19, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP W0, #14 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.LS 411ba8 <_Z7softmaxPKfPfS1_i+0x1c8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| HINT #0 | N/A | ||||||||||||||||||||
| LDR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| SUB W0, W0, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| STR S14, [X1, X0,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| LDP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | vect (100.0%) |
| LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X29, X30, [SP], #160 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_0
| Source file and lines | attention_v2.cpp:42-63 |
| Module | attention-gcc-native |
| nb instructions | 43 |
| nb uops | 42 |
| loop length | 172 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 2 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 20 |
| micro-operation queue | 5.25 cycles |
| front end | 5.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 3.25 | 3.25 | 3.17 | 3.17 | 3.08 | 3.08 | 1.00 | 1.00 | 0.00 | 0.00 | 7.17 | 6.83 | 7.00 | 5.50 | 5.50 |
| cycles | 1.50 | 1.50 | 3.25 | 3.25 | 3.17 | 3.17 | 3.08 | 3.08 | 1.00 | 1.00 | 0.00 | 0.00 | 7.17 | 6.83 | 7.00 | 5.50 | 5.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.25 |
| Dispatch | 7.17 |
| Overall L1 | 7.17 |
| all | 5% |
| load | 14% |
| store | 7% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 68% |
| load | 100% |
| store | 73% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 45% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 50% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-160]! | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STR W3, [SP, #132] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR X0, [SP, #136] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| CMP W3, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.LE 411bd8 <_Z7softmaxPKfPfS1_i+0x1f8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ORR X20, XZR, X0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| MOVZ X19, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ORR X21, XZR, X2 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ADD X22, X0, #4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ADD X2, X2, #4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| SUB X0, X1, #8 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| ORR W1, WZR, W19 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD W26, W3, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| UBFM X26, X26, #62, #61 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| MOVZ X24, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| CNTW X23, ALL | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (50.0%) |
| STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| SBFM X28, X3, #0, #31 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (100.0%) |
| SUB X27, X26, #4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| STR X2, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X0, [SP, #144] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| SUB W0, W19, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP W0, #14 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.LS 411ba8 <_Z7softmaxPKfPfS1_i+0x1c8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| HINT #0 | N/A | ||||||||||||||||||||
| LDR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| SUB W0, W0, #1 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| STR S14, [X1, X0,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| LDP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | vect (100.0%) |
| LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X29, X30, [SP], #160 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼softmax(float const*, float*, float*, int)– | 0.87 | 0.08 |
| ▼Loop 35 - attention_v2.cpp:44-61 - attention-gcc-native– | 0.00 | 0.00 |
| ▼Loop 34 - attention_v2.cpp:44-61 - attention-gcc-native– | 0.00 | 0.00 |
| ▼Loop 33 - attention_v2.cpp:44-61 - attention-gcc-native– | 0.06 | 0.00 |
| ○Loop 31 - attention_v2.cpp:55-56 - attention-gcc-native | 0.46 | 0.04 |
| ○Loop 30 - attention_v2.cpp:52-53 - attention-gcc-native | 0.23 | 0.02 |
| ○Loop 32 - attention_v2.cpp:47-48 - attention-gcc-native | 0.06 | 0.00 |
| ○Loop 29 - attention_v2.cpp:47-48 - attention-gcc-native | 0.06 | 0.00 |
