| Function: softmax(float const*, float*, float*, int) | Module: attention-gcc-native | Source: attention_v2.cpp:42-63 | Coverage (incl. loops): 0.92% | (excl. loops): 0.00% |
|---|
| Function: softmax(float const*, float*, float*, int) | Module: attention-gcc-native | Source: attention_v2.cpp:42-63 | Coverage (incl. loops): 0.92% | (excl. loops): 0.00% |
|---|
/home/eoseret/llm-attention/attention_v2.cpp: 42 - 63 |
-------------------------------------------------------------------------------- |
42: { |
43: for (int row = 0; row < N; ++row) { |
44: const float *S_row = &S[row * N]; |
45: |
46: float max_val = -FLT_MAX; |
47: for (int idx = 0; idx <= row; ++idx) // vectorised |
48: if (S_row[idx] > max_val) max_val = S_row[idx]; |
49: |
50: float sum = 0.0f; |
51: #pragma clang loop vectorize(enable) |
52: for (int idx = 0; idx <= row; ++idx) // vectorised |
53: sum += expf(S_row[idx] - max_val); |
54: |
55: for (int idx = 0; idx <= row; ++idx) //vectorised |
56: P[row * N + idx] = expf(S_row[idx] - max_val) / sum; |
57: |
58: for (int idx = row + 1; idx < N; ++idx) |
59: P[row * N + idx] = 0.0f; |
60: |
61: D[row] = sum; |
62: } |
63: } |
0x4119e0 STP X29, X30, [SP, #-176]! |
0x4119e4 ADD X29, SP, #0 |
0x4119e8 STR X1, [SP, #152] |
0x4119ec STR W3, [SP, #164] |
0x4119f0 STR X0, [SP, #168] |
0x4119f4 CMP W3, #0 |
0x4119f8 B.LE 411c18 |
0x4119fc ORR X4, XZR, X0 |
0x411a00 ADD W0, W3, #1 |
0x411a04 STP X19, X20, [SP, #16] |
0x411a08 UBFM X0, X0, #62, #61 |
0x411a0c MOVZ X19, #1 |
0x411a10 PTRUE P7.B, ALL |
0x411a14 CNTB X1, ALL |
0x411a18 ORR X20, XZR, X4 |
0x411a1c STR X0, [SP, #144] |
0x411a20 SUB X0, X0, #4 |
0x411a24 ADD W1, W1, #15 |
0x411a28 STP X21, X22, [SP, #32] |
0x411a2c ORR X21, XZR, X2 |
0x411a30 ADD X22, X4, #4 |
0x411a34 STR X0, [SP, #128] |
0x411a38 SBFM X0, X3, #0, #31 |
0x411a3c STP X23, X24, [SP, #48] |
0x411a40 RDVL X23, #-1 |
0x411a44 CNTW X24, ALL |
0x411a48 ADD W23, W23, #1 |
0x411a4c STR X0, [SP, #136] |
0x411a50 ADD X0, X2, #4 |
0x411a54 ORR W2, WZR, W19 |
0x411a58 STP X25, X26, [SP, #64] |
0x411a5c CNTB X25, ALL |
0x411a60 CNTB X26, ALL, MUL #4 |
0x411a64 STR X0, [SP, #112] |
0x411a68 SUB W0, W19, #1 |
0x411a6c CMP W0, W1 |
0x411a70 STP X27, X28, [SP, #80] |
0x411a74 MOVZ W1, #0 |
0x411a78 MOVZ X27, #0 |
0x411a7c STP D14, D15, [SP, #96] |
0x411a80 B.CC 411bec |
(33) 0x411a84 ORR X0, XZR, X20 |
(33) 0x411a88 DUPM Z30.S, #0xff7fffff |
(33) 0x411a8c ORR Z31.D, Z30.D, Z30.D |
(33) 0x411a90 ORR Z28.D, Z30.D, Z30.D |
(33) 0x411a94 ORR Z29.D, Z30.D, Z30.D |
(33) 0x411a98 HINT #0 |
(33) 0x411a9c HINT #0 |
(32) 0x411aa0 LD1W {Z26.S}, P7/Z, [X0, MUL VL] |
(32) 0x411aa4 LD1W {Z27.S}, P7/Z, [X0, #1, MUL VL] |
(32) 0x411aa8 ADD W1, W1, W25 |
(32) 0x411aac FMAXNM Z29.S, P7/M, Z29.S, Z26.S |
(32) 0x411ab0 FMAXNM Z28.S, P7/M, Z28.S, Z27.S |
(32) 0x411ab4 LD1W {Z26.S}, P7/Z, [X0, #2, MUL VL] |
(32) 0x411ab8 LD1W {Z27.S}, P7/Z, [X0, #3, MUL VL] |
(32) 0x411abc FMAXNM Z31.S, P7/M, Z31.S, Z26.S |
(32) 0x411ac0 ADD X0, X0, X26 |
(32) 0x411ac4 FMAXNM Z30.S, P7/M, Z30.S, Z27.S |
(32) 0x411ac8 CMP W23, W1 |
(32) 0x411acc B.CS 411aa0 |
(33) 0x411ad0 FMAXNM Z31.S, P7/M, Z31.S, Z30.S |
(33) 0x411ad4 FMAXNM Z29.S, P7/M, Z29.S, Z28.S |
(33) 0x411ad8 FMAXNM Z31.S, P7/M, Z31.S, Z29.S |
(33) 0x411adc CMP W2, W1 |
(33) 0x411ae0 B.EQ 411b14 |
(34) 0x411ae4 LDR X4, [SP, #168] |
(34) 0x411ae8 ADD X3, X27, W1,UXTW |
(34) 0x411aec MOVZ X0, #0 |
(34) 0x411af0 SUB W1, W2, W1 |
(34) 0x411af4 WHILELO P7.S, WZR, W1 |
(34) 0x411af8 ADD X3, X4, X3,LSL #2 |
(34) 0x411afc HINT #0 |
(29) 0x411b00 LD1W {Z30.S}, P7/Z, [X3, X0,LSL #2] |
(29) 0x411b04 ADD X0, X0, X24 |
(29) 0x411b08 FMAXNM Z31.S, P7/M, Z31.S, Z30.S |
(29) 0x411b0c WHILELO P7.S, W0, W1 |
(29) 0x411b10 B.NE 411b00 |
(33) 0x411b14 MOVI V14.2S, #0 |
(33) 0x411b18 ORR X28, XZR, X20 |
(33) 0x411b1c PTRUE P7.B, ALL |
(33) 0x411b20 FMAXNMV S15, P7, Z31.S |
(30) 0x411b24 LDR S0, [X28], #4 |
(30) 0x411b28 FSUB S0, S0, S15 |
(30) 0x411b2c BL 410140 |
(30) 0x411b30 FADD S14, S14, S0 |
(30) 0x411b34 CMP X28, X22 |
(30) 0x411b38 B.NE 411b24 |
(33) 0x411b3c MOVZ X28, #0 |
(31) 0x411b40 LDR S0, [X20, X28,LSL #2] |
(31) 0x411b44 FSUB S0, S0, S15 |
(31) 0x411b48 BL 410140 |
(31) 0x411b4c FDIV S0, S0, S14 |
(31) 0x411b50 STR S0, [X21, X28,LSL #2] |
(31) 0x411b54 ADD X28, X28, #1 |
(31) 0x411b58 CMP X19, X28 |
(31) 0x411b5c B.NE 411b40 |
(33) 0x411b60 LDR X0, [SP, #128] |
(33) 0x411b64 ADD W23, W23, #1 |
(33) 0x411b68 ADD X3, X19, #1 |
(33) 0x411b6c LDR X28, [SP, #144] |
(33) 0x411b70 ADD X20, X20, X0 |
(33) 0x411b74 ADD X21, X21, X0 |
(33) 0x411b78 LDR X0, [SP, #136] |
(33) 0x411b7c ADD X22, X22, X28 |
(33) 0x411b80 CMP X19, X0 |
(33) 0x411b84 ADD X27, X27, X0 |
(33) 0x411b88 LDR W0, [SP, #164] |
(33) 0x411b8c B.EQ 411bf4 |
(33) 0x411b90 LDR X19, [SP, #112] |
(33) 0x411b94 SUB W2, W0, W3 |
(33) 0x411b98 MOVZ W1, #0 |
(33) 0x411b9c ADD X2, X2, #1 |
(33) 0x411ba0 STR X3, [SP, #120] |
(33) 0x411ba4 UBFM X2, X2, #62, #61 |
(33) 0x411ba8 ORR X0, XZR, X19 |
(33) 0x411bac BL 410060 |
(33) 0x411bb0 LDR X0, [SP, #152] |
(33) 0x411bb4 ADD X1, X19, X28 |
(33) 0x411bb8 PTRUE P7.B, ALL |
(33) 0x411bbc LDR X3, [SP, #120] |
(33) 0x411bc0 STR X1, [SP, #112] |
(33) 0x411bc4 CNTB X1, ALL |
(33) 0x411bc8 ADD W1, W1, #15 |
(33) 0x411bcc SUB X0, X0, #8 |
(33) 0x411bd0 ORR X19, XZR, X3 |
(33) 0x411bd4 STR S14, [X0, X3,LSL #2] |
(33) 0x411bd8 SUB W0, W19, #1 |
(33) 0x411bdc ORR W2, WZR, W19 |
(33) 0x411be0 CMP W0, W1 |
(33) 0x411be4 MOVZ W1, #0 |
(33) 0x411be8 B.CS 411a84 |
(35) 0x411bec DUPM Z31.S, #0xff7fffff |
(35) 0x411bf0 B 411ae4 |
0x411bf4 LDR X1, [SP, #152] |
0x411bf8 SUB W0, W0, #1 |
0x411bfc STR S14, [X1, X0,LSL #2] |
0x411c00 LDP D14, D15, [SP, #96] |
0x411c04 LDP X19, X20, [SP, #16] |
0x411c08 LDP X21, X22, [SP, #32] |
0x411c0c LDP X23, X24, [SP, #48] |
0x411c10 LDP X25, X26, [SP, #64] |
0x411c14 LDP X27, X28, [SP, #80] |
0x411c18 LDP X29, X30, [SP], #176 |
0x411c1c RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | attention_v2.cpp:283 | attention-gcc-native |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | stl_vector.h:1920 | attention-gcc-native |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_0
| Source file and lines | attention_v2.cpp:42-63 |
| Module | attention-gcc-native |
| nb instructions | 52 |
| nb uops | 52 |
| loop length | 208 |
| used w registers | 7 |
| used x registers | 18 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 2 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 22 |
| micro-operation queue | 6.50 cycles |
| front end | 6.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 6.75 | 6.75 | 6.75 | 6.75 | 1.00 | 1.00 | 0.00 | 0.00 | 7.83 | 7.50 | 7.67 | 6.50 | 6.50 |
| cycles | 1.50 | 1.50 | 6.75 | 6.75 | 6.75 | 6.75 | 1.00 | 1.00 | 0.00 | 0.00 | 7.83 | 7.50 | 7.67 | 6.50 | 6.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 6.50 |
| Dispatch | 7.83 |
| Overall L1 | 7.83 |
| all | 5% |
| load | 14% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 34% |
| load | 50% |
| store | 35% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 27% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-176]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR W3, [SP, #164] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (12.5%) |
| STR X0, [SP, #168] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| CMP W3, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 411c18 <_Z7softmaxPKfPfS1_i+0x238> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X4, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD W0, W3, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| UBFM X0, X0, #62, #61 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ X19, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| PTRUE P7.B, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (100.0%) |
| CNTB X1, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| ORR X20, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X0, [SP, #144] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SUB X0, X0, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ADD W1, W1, #15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ORR X21, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X4, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X0, [SP, #128] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SBFM X0, X3, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| RDVL X23, #-1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| CNTW X24, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ADD W23, W23, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| STR X0, [SP, #136] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| ADD X0, X2, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR W2, WZR, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| CNTB X25, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| CNTB X26, ALL, MUL #4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| STR X0, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SUB W0, W19, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| MOVZ W1, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ X27, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | vect (50.0%) |
| B.CC 411bec <_Z7softmaxPKfPfS1_i+0x20c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| SUB W0, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STR S14, [X1, X0,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| LDP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | vect (50.0%) |
| LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #176 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_0
| Source file and lines | attention_v2.cpp:42-63 |
| Module | attention-gcc-native |
| nb instructions | 52 |
| nb uops | 52 |
| loop length | 208 |
| used w registers | 7 |
| used x registers | 18 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 2 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 22 |
| micro-operation queue | 6.50 cycles |
| front end | 6.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 6.75 | 6.75 | 6.75 | 6.75 | 1.00 | 1.00 | 0.00 | 0.00 | 7.83 | 7.50 | 7.67 | 6.50 | 6.50 |
| cycles | 1.50 | 1.50 | 6.75 | 6.75 | 6.75 | 6.75 | 1.00 | 1.00 | 0.00 | 0.00 | 7.83 | 7.50 | 7.67 | 6.50 | 6.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 6.50 |
| Dispatch | 7.83 |
| Overall L1 | 7.83 |
| all | 5% |
| load | 14% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 34% |
| load | 50% |
| store | 35% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 27% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-176]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR W3, [SP, #164] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (12.5%) |
| STR X0, [SP, #168] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| CMP W3, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 411c18 <_Z7softmaxPKfPfS1_i+0x238> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X4, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD W0, W3, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| UBFM X0, X0, #62, #61 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ X19, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| PTRUE P7.B, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (100.0%) |
| CNTB X1, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| ORR X20, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X0, [SP, #144] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SUB X0, X0, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ADD W1, W1, #15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ORR X21, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X4, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X0, [SP, #128] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SBFM X0, X3, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| RDVL X23, #-1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| CNTW X24, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ADD W23, W23, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| STR X0, [SP, #136] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| ADD X0, X2, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR W2, WZR, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| CNTB X25, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| CNTB X26, ALL, MUL #4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| STR X0, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SUB W0, W19, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| MOVZ W1, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ X27, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | vect (50.0%) |
| B.CC 411bec <_Z7softmaxPKfPfS1_i+0x20c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X1, [SP, #152] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| SUB W0, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STR S14, [X1, X0,LSL #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| LDP D14, D15, [SP, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | vect (50.0%) |
| LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #176 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼softmax(float const*, float*, float*, int)– | 0.92 | 0.09 |
| ▼Loop 35 - attention_v2.cpp:44-61 - attention-gcc-native– | 0.00 | 0.00 |
| ▼Loop 34 - attention_v2.cpp:44-61 - attention-gcc-native– | 0.00 | 0.00 |
| ▼Loop 33 - attention_v2.cpp:44-61 - attention-gcc-native– | 0.10 | 0.01 |
| ○Loop 31 - attention_v2.cpp:55-56 - attention-gcc-native | 0.39 | 0.04 |
| ○Loop 30 - attention_v2.cpp:52-53 - attention-gcc-native | 0.29 | 0.03 |
| ○Loop 32 - attention_v2.cpp:47-48 - attention-gcc-native | 0.05 | 0.00 |
| ○Loop 29 - attention_v2.cpp:47-48 - attention-gcc-native | 0.10 | 0.01 |
