| Loop Id: 61 | Module: attention-aocc-znver5-256 | Source: attention_v2.cpp:164-167 [...] | Coverage: 0.25% |
|---|
| Loop Id: 61 | Module: attention-aocc-znver5-256 | Source: attention_v2.cpp:164-167 [...] | Coverage: 0.25% |
|---|
0x4700 VMOVSS -0x3864(%RIP),%XMM1 |
0x4708 VXORPS %XMM0,%XMM0,%XMM0 |
0x470c MOV %R9,%RAX |
0x470f JMP 47cc |
(68) 0x4720 MOV 0x1710(%RSP),%RCX |
(68) 0x4728 MOV 0x398(%RSP),%RSI |
(68) 0x4730 MOV $-0x80000000,%RDX |
(68) 0x4737 XOR %R12D,%R12D |
(68) 0x473a AND %RDX,%RCX |
(68) 0x473d MOV %ESI,%EDX |
(68) 0x473f AND $0x7ffffffe,%EDX |
(68) 0x4745 OR %RCX,%RDX |
(68) 0x4748 MOV %ESI,%ECX |
(68) 0x474a AND $0x1,%ECX |
(68) 0x474d SHR $0x1,%RDX |
(68) 0x4750 XOR 0xff8(%RSP),%RDX |
(68) 0x4758 NEG %ECX |
(68) 0x475a AND %EDI,%ECX |
(68) 0x475c XOR %RDX,%RCX |
(68) 0x475f MOV %RCX,0x1710(%RSP) |
(68) 0x4767 MOV %R12,%RCX |
(68) 0x476a INC %R12 |
(68) 0x476d MOV $0x200b,%EDX |
(68) 0x4772 MOV %R12,0x1718(%RSP) |
(68) 0x477a MOV 0x398(%RSP,%RCX,8),%RCX |
(68) 0x4782 BEXTR %RDX,%RCX,%RDX |
(68) 0x4787 XOR %RCX,%RDX |
(68) 0x478a MOV %EDX,%ECX |
(68) 0x478c SAL $0x7,%ECX |
(68) 0x478f AND $-0x62d3a980,%ECX |
(68) 0x4795 XOR %RDX,%RCX |
(68) 0x4798 MOV %ECX,%EDX |
(68) 0x479a SAL $0xf,%EDX |
(68) 0x479d AND $-0x103a0000,%EDX |
(68) 0x47a3 XOR %RCX,%RDX |
(68) 0x47a6 MOV %RDX,%RCX |
(68) 0x47a9 SHR $0x12,%RCX |
(68) 0x47ad XOR %RDX,%RCX |
(68) 0x47b0 DEC %RAX |
(68) 0x47b3 VCVTUSI2SS %RCX,%XMM5,%XMM2 |
(68) 0x47b9 VFMADD231SS %XMM2,%XMM1,%XMM0 |
(68) 0x47be VMULSS -0x3926(%RIP),%XMM1,%XMM1 |
(68) 0x47c6 JE 4980 |
(68) 0x47cc CMP $0x270,%R12 |
(68) 0x47d3 JB 4767 |
(68) 0x47d5 VPBROADCASTQ %RSI,%YMM2 |
(68) 0x47db XOR %ECX,%ECX |
(68) 0x47dd NOPL (%RAX) |
(69) 0x47e0 VMOVDQA %YMM2,%YMM3 |
(69) 0x47e4 VMOVDQU 0x3a0(%RSP,%RCX,8),%YMM2 |
(69) 0x47ed VPANDQ -0x3937(%RIP){1to4},%YMM2,%YMM4 |
(69) 0x47f7 VPTESTMQ -0x3921(%RIP){1to4},%YMM2,%K1 |
(69) 0x4801 VALIGNQ $0x3,%YMM3,%YMM2,%YMM3 |
(69) 0x4808 VPTERNLOGQ $-0x8,-0x392b(%RIP){1to4},%YMM3,%YMM4 |
(69) 0x4813 VPSRLQ $0x1,%YMM4,%YMM3 |
(69) 0x4818 VPXOR 0x1000(%RSP,%RCX,8),%YMM3,%YMM3 |
(69) 0x4821 VPXORQ -0x393b(%RIP){1to4},%YMM3,%YMM3{%K1} |
(69) 0x482b VMOVDQU %YMM3,0x398(%RSP,%RCX,8) |
(69) 0x4834 ADD $0x4,%RCX |
(69) 0x4838 CMP $0xe0,%RCX |
(69) 0x483f JNE 47e0 |
(68) 0x4841 MOV 0xaa0(%RSP),%RDX |
(68) 0x4849 VEXTRACTI128 $0x1,%YMM2,%XMM2 |
(68) 0x484f MOV 0xaa8(%RSP),%RCX |
(68) 0x4857 MOV $-0x66f74f21,%R8D |
(68) 0x485d VPEXTRQ $0x1,%XMM2,%RSI |
(68) 0x4863 AND $-0x80000000,%RSI |
(68) 0x486a MOV %EDX,%EDI |
(68) 0x486c AND $0x7ffffffe,%EDI |
(68) 0x4872 OR %RSI,%RDI |
(68) 0x4875 MOV %EDX,%ESI |
(68) 0x4877 AND $0x1,%ESI |
(68) 0x487a AND $-0x80000000,%RDX |
(68) 0x4881 SHR $0x1,%RDI |
(68) 0x4884 XOR 0x1700(%RSP),%RDI |
(68) 0x488c NEG %ESI |
(68) 0x488e AND %R8D,%ESI |
(68) 0x4891 XOR %RDI,%RSI |
(68) 0x4894 MOV $-0x66f74f21,%EDI |
(68) 0x4899 MOV %RSI,0xa98(%RSP) |
(68) 0x48a1 MOV %ECX,%ESI |
(68) 0x48a3 AND $0x7ffffffe,%ESI |
(68) 0x48a9 OR %RDX,%RSI |
(68) 0x48ac MOV %ECX,%EDX |
(68) 0x48ae AND $0x1,%EDX |
(68) 0x48b1 AND $-0x80000000,%RCX |
(68) 0x48b8 SHR $0x1,%RSI |
(68) 0x48bb XOR 0x1708(%RSP),%RSI |
(68) 0x48c3 NEG %EDX |
(68) 0x48c5 AND %R8D,%EDX |
(68) 0x48c8 XOR %RSI,%RDX |
(68) 0x48cb MOV %RDX,0xaa0(%RSP) |
(68) 0x48d3 MOV 0xab0(%RSP),%RDX |
(68) 0x48db MOV %EDX,%ESI |
(68) 0x48dd VPBROADCASTQ %RDX,%XMM2 |
(68) 0x48e3 AND $0x7ffffffe,%EDX |
(68) 0x48e9 AND $0x1,%ESI |
(68) 0x48ec OR %RCX,%RDX |
(68) 0x48ef NEG %ESI |
(68) 0x48f1 MOV $0xe4,%ECX |
(68) 0x48f6 SHR $0x1,%RDX |
(68) 0x48f9 XOR 0x1710(%RSP),%RDX |
(68) 0x4901 AND %R8D,%ESI |
(68) 0x4904 XOR %RDX,%RSI |
(68) 0x4907 MOV %RSI,0xaa8(%RSP) |
(68) 0x490f NOP |
(70) 0x4910 VMOVDQU 0x398(%RSP,%RCX,8),%XMM3 |
(70) 0x4919 VPANDQ -0x3a63(%RIP){1to2},%XMM3,%XMM4 |
(70) 0x4923 VPTESTMQ -0x3a4d(%RIP){1to2},%XMM3,%K1 |
(70) 0x492d VPALIGNR $0x8,%XMM2,%XMM3,%XMM2 |
(70) 0x4933 VPTERNLOGQ $-0x8,-0x3a56(%RIP){1to2},%XMM2,%XMM4 |
(70) 0x493e VPSRLQ $0x1,%XMM4,%XMM2 |
(70) 0x4943 VPXOR -0x388(%RSP,%RCX,8),%XMM2,%XMM2 |
(70) 0x494c VPXORQ -0x3a66(%RIP){1to2},%XMM2,%XMM2{%K1} |
(70) 0x4956 VMOVDQU %XMM2,0x390(%RSP,%RCX,8) |
(70) 0x495f ADD $0x2,%RCX |
(70) 0x4963 VMOVDQA %XMM3,%XMM2 |
(70) 0x4967 CMP $0x270,%RCX |
(70) 0x496e JNE 4910 |
(68) 0x4970 JMP 4720 |
0x4980 VDIVSS %XMM1,%XMM0,%XMM0 |
0x4984 VMOVSS -0x3ae8(%RIP),%XMM2 |
0x498c VUCOMISS %XMM2,%XMM0 |
0x4990 JAE 4ef6 |
0x4996 MOV 0x30(%RSP),%RAX |
0x499b VMOVAPS %XMM2,%XMM1 |
0x499f VMOVSS %XMM0,(%RAX,%R10,4) |
0x49a5 VXORPS %XMM0,%XMM0,%XMM0 |
0x49a9 MOV %R9,%RAX |
0x49ac JMP 4a6c |
(65) 0x49c0 MOV 0x1710(%RSP),%RCX |
(65) 0x49c8 MOV 0x398(%RSP),%RSI |
(65) 0x49d0 MOV $-0x80000000,%RDX |
(65) 0x49d7 XOR %R12D,%R12D |
(65) 0x49da AND %RDX,%RCX |
(65) 0x49dd MOV %ESI,%EDX |
(65) 0x49df AND $0x7ffffffe,%EDX |
(65) 0x49e5 OR %RCX,%RDX |
(65) 0x49e8 MOV %ESI,%ECX |
(65) 0x49ea AND $0x1,%ECX |
(65) 0x49ed SHR $0x1,%RDX |
(65) 0x49f0 XOR 0xff8(%RSP),%RDX |
(65) 0x49f8 NEG %ECX |
(65) 0x49fa AND %EDI,%ECX |
(65) 0x49fc XOR %RDX,%RCX |
(65) 0x49ff MOV %RCX,0x1710(%RSP) |
(65) 0x4a07 MOV %R12,%RCX |
(65) 0x4a0a INC %R12 |
(65) 0x4a0d MOV $0x200b,%EDX |
(65) 0x4a12 MOV %R12,0x1718(%RSP) |
(65) 0x4a1a MOV 0x398(%RSP,%RCX,8),%RCX |
(65) 0x4a22 BEXTR %RDX,%RCX,%RDX |
(65) 0x4a27 XOR %RCX,%RDX |
(65) 0x4a2a MOV %EDX,%ECX |
(65) 0x4a2c SAL $0x7,%ECX |
(65) 0x4a2f AND $-0x62d3a980,%ECX |
(65) 0x4a35 XOR %RDX,%RCX |
(65) 0x4a38 MOV %ECX,%EDX |
(65) 0x4a3a SAL $0xf,%EDX |
(65) 0x4a3d AND $-0x103a0000,%EDX |
(65) 0x4a43 XOR %RCX,%RDX |
(65) 0x4a46 MOV %RDX,%RCX |
(65) 0x4a49 SHR $0x12,%RCX |
(65) 0x4a4d XOR %RDX,%RCX |
(65) 0x4a50 DEC %RAX |
(65) 0x4a53 VCVTUSI2SS %RCX,%XMM5,%XMM2 |
(65) 0x4a59 VFMADD231SS %XMM2,%XMM1,%XMM0 |
(65) 0x4a5e VMULSS -0x3bc6(%RIP),%XMM1,%XMM1 |
(65) 0x4a66 JE 4c20 |
(65) 0x4a6c CMP $0x270,%R12 |
(65) 0x4a73 JB 4a07 |
(65) 0x4a75 VPBROADCASTQ %RSI,%YMM2 |
(65) 0x4a7b XOR %ECX,%ECX |
(65) 0x4a7d NOPL (%RAX) |
(66) 0x4a80 VMOVDQA %YMM2,%YMM3 |
(66) 0x4a84 VMOVDQU 0x3a0(%RSP,%RCX,8),%YMM2 |
(66) 0x4a8d VPANDQ -0x3bd7(%RIP){1to4},%YMM2,%YMM4 |
(66) 0x4a97 VPTESTMQ -0x3bc1(%RIP){1to4},%YMM2,%K1 |
(66) 0x4aa1 VALIGNQ $0x3,%YMM3,%YMM2,%YMM3 |
(66) 0x4aa8 VPTERNLOGQ $-0x8,-0x3bcb(%RIP){1to4},%YMM3,%YMM4 |
(66) 0x4ab3 VPSRLQ $0x1,%YMM4,%YMM3 |
(66) 0x4ab8 VPXOR 0x1000(%RSP,%RCX,8),%YMM3,%YMM3 |
(66) 0x4ac1 VPXORQ -0x3bdb(%RIP){1to4},%YMM3,%YMM3{%K1} |
(66) 0x4acb VMOVDQU %YMM3,0x398(%RSP,%RCX,8) |
(66) 0x4ad4 ADD $0x4,%RCX |
(66) 0x4ad8 CMP $0xe0,%RCX |
(66) 0x4adf JNE 4a80 |
(65) 0x4ae1 MOV 0xaa0(%RSP),%RDX |
(65) 0x4ae9 VEXTRACTI128 $0x1,%YMM2,%XMM2 |
(65) 0x4aef MOV 0xaa8(%RSP),%RCX |
(65) 0x4af7 MOV $-0x66f74f21,%R8D |
(65) 0x4afd VPEXTRQ $0x1,%XMM2,%RSI |
(65) 0x4b03 AND $-0x80000000,%RSI |
(65) 0x4b0a MOV %EDX,%EDI |
(65) 0x4b0c AND $0x7ffffffe,%EDI |
(65) 0x4b12 OR %RSI,%RDI |
(65) 0x4b15 MOV %EDX,%ESI |
(65) 0x4b17 AND $0x1,%ESI |
(65) 0x4b1a AND $-0x80000000,%RDX |
(65) 0x4b21 SHR $0x1,%RDI |
(65) 0x4b24 XOR 0x1700(%RSP),%RDI |
(65) 0x4b2c NEG %ESI |
(65) 0x4b2e AND %R8D,%ESI |
(65) 0x4b31 XOR %RDI,%RSI |
(65) 0x4b34 MOV $-0x66f74f21,%EDI |
(65) 0x4b39 MOV %RSI,0xa98(%RSP) |
(65) 0x4b41 MOV %ECX,%ESI |
(65) 0x4b43 AND $0x7ffffffe,%ESI |
(65) 0x4b49 OR %RDX,%RSI |
(65) 0x4b4c MOV %ECX,%EDX |
(65) 0x4b4e AND $0x1,%EDX |
(65) 0x4b51 AND $-0x80000000,%RCX |
(65) 0x4b58 SHR $0x1,%RSI |
(65) 0x4b5b XOR 0x1708(%RSP),%RSI |
(65) 0x4b63 NEG %EDX |
(65) 0x4b65 AND %R8D,%EDX |
(65) 0x4b68 XOR %RSI,%RDX |
(65) 0x4b6b MOV %RDX,0xaa0(%RSP) |
(65) 0x4b73 MOV 0xab0(%RSP),%RDX |
(65) 0x4b7b MOV %EDX,%ESI |
(65) 0x4b7d VPBROADCASTQ %RDX,%XMM2 |
(65) 0x4b83 AND $0x7ffffffe,%EDX |
(65) 0x4b89 AND $0x1,%ESI |
(65) 0x4b8c OR %RCX,%RDX |
(65) 0x4b8f NEG %ESI |
(65) 0x4b91 MOV $0xe4,%ECX |
(65) 0x4b96 SHR $0x1,%RDX |
(65) 0x4b99 XOR 0x1710(%RSP),%RDX |
(65) 0x4ba1 AND %R8D,%ESI |
(65) 0x4ba4 XOR %RDX,%RSI |
(65) 0x4ba7 MOV %RSI,0xaa8(%RSP) |
(65) 0x4baf NOP |
(67) 0x4bb0 VMOVDQU 0x398(%RSP,%RCX,8),%XMM3 |
(67) 0x4bb9 VPANDQ -0x3d03(%RIP){1to2},%XMM3,%XMM4 |
(67) 0x4bc3 VPTESTMQ -0x3ced(%RIP){1to2},%XMM3,%K1 |
(67) 0x4bcd VPALIGNR $0x8,%XMM2,%XMM3,%XMM2 |
(67) 0x4bd3 VPTERNLOGQ $-0x8,-0x3cf6(%RIP){1to2},%XMM2,%XMM4 |
(67) 0x4bde VPSRLQ $0x1,%XMM4,%XMM2 |
(67) 0x4be3 VPXOR -0x388(%RSP,%RCX,8),%XMM2,%XMM2 |
(67) 0x4bec VPXORQ -0x3d06(%RIP){1to2},%XMM2,%XMM2{%K1} |
(67) 0x4bf6 VMOVDQU %XMM2,0x390(%RSP,%RCX,8) |
(67) 0x4bff ADD $0x2,%RCX |
(67) 0x4c03 VMOVDQA %XMM3,%XMM2 |
(67) 0x4c07 CMP $0x270,%RCX |
(67) 0x4c0e JNE 4bb0 |
(65) 0x4c10 JMP 49c0 |
0x4c20 VDIVSS %XMM1,%XMM0,%XMM0 |
0x4c24 VMOVSS -0x3d88(%RIP),%XMM2 |
0x4c2c VUCOMISS %XMM2,%XMM0 |
0x4c30 JAE 4f3d |
0x4c36 MOV 0x38(%RSP),%RAX |
0x4c3b VMOVAPS %XMM2,%XMM1 |
0x4c3f VMOVSS %XMM0,(%RAX,%R10,4) |
0x4c45 VXORPS %XMM0,%XMM0,%XMM0 |
0x4c49 MOV %R9,%RAX |
0x4c4c JMP 4d0c |
(62) 0x4c60 MOV 0x1710(%RSP),%RCX |
(62) 0x4c68 MOV 0x398(%RSP),%RSI |
(62) 0x4c70 MOV $-0x80000000,%RDX |
(62) 0x4c77 XOR %R12D,%R12D |
(62) 0x4c7a AND %RDX,%RCX |
(62) 0x4c7d MOV %ESI,%EDX |
(62) 0x4c7f AND $0x7ffffffe,%EDX |
(62) 0x4c85 OR %RCX,%RDX |
(62) 0x4c88 MOV %ESI,%ECX |
(62) 0x4c8a AND $0x1,%ECX |
(62) 0x4c8d SHR $0x1,%RDX |
(62) 0x4c90 XOR 0xff8(%RSP),%RDX |
(62) 0x4c98 NEG %ECX |
(62) 0x4c9a AND %EDI,%ECX |
(62) 0x4c9c XOR %RDX,%RCX |
(62) 0x4c9f MOV %RCX,0x1710(%RSP) |
(62) 0x4ca7 MOV %R12,%RCX |
(62) 0x4caa INC %R12 |
(62) 0x4cad MOV $0x200b,%EDX |
(62) 0x4cb2 MOV %R12,0x1718(%RSP) |
(62) 0x4cba MOV 0x398(%RSP,%RCX,8),%RCX |
(62) 0x4cc2 BEXTR %RDX,%RCX,%RDX |
(62) 0x4cc7 XOR %RCX,%RDX |
(62) 0x4cca MOV %EDX,%ECX |
(62) 0x4ccc SAL $0x7,%ECX |
(62) 0x4ccf AND $-0x62d3a980,%ECX |
(62) 0x4cd5 XOR %RDX,%RCX |
(62) 0x4cd8 MOV %ECX,%EDX |
(62) 0x4cda SAL $0xf,%EDX |
(62) 0x4cdd AND $-0x103a0000,%EDX |
(62) 0x4ce3 XOR %RCX,%RDX |
(62) 0x4ce6 MOV %RDX,%RCX |
(62) 0x4ce9 SHR $0x12,%RCX |
(62) 0x4ced XOR %RDX,%RCX |
(62) 0x4cf0 DEC %RAX |
(62) 0x4cf3 VCVTUSI2SS %RCX,%XMM5,%XMM2 |
(62) 0x4cf9 VFMADD231SS %XMM2,%XMM1,%XMM0 |
(62) 0x4cfe VMULSS -0x3e66(%RIP),%XMM1,%XMM1 |
(62) 0x4d06 JE 4ec0 |
(62) 0x4d0c CMP $0x270,%R12 |
(62) 0x4d13 JB 4ca7 |
(62) 0x4d15 VPBROADCASTQ %RSI,%YMM2 |
(62) 0x4d1b XOR %ECX,%ECX |
(62) 0x4d1d NOPL (%RAX) |
(63) 0x4d20 VMOVDQA %YMM2,%YMM3 |
(63) 0x4d24 VMOVDQU 0x3a0(%RSP,%RCX,8),%YMM2 |
(63) 0x4d2d VPANDQ -0x3e77(%RIP){1to4},%YMM2,%YMM4 |
(63) 0x4d37 VPTESTMQ -0x3e61(%RIP){1to4},%YMM2,%K1 |
(63) 0x4d41 VALIGNQ $0x3,%YMM3,%YMM2,%YMM3 |
(63) 0x4d48 VPTERNLOGQ $-0x8,-0x3e6b(%RIP){1to4},%YMM3,%YMM4 |
(63) 0x4d53 VPSRLQ $0x1,%YMM4,%YMM3 |
(63) 0x4d58 VPXOR 0x1000(%RSP,%RCX,8),%YMM3,%YMM3 |
(63) 0x4d61 VPXORQ -0x3e7b(%RIP){1to4},%YMM3,%YMM3{%K1} |
(63) 0x4d6b VMOVDQU %YMM3,0x398(%RSP,%RCX,8) |
(63) 0x4d74 ADD $0x4,%RCX |
(63) 0x4d78 CMP $0xe0,%RCX |
(63) 0x4d7f JNE 4d20 |
(62) 0x4d81 MOV 0xaa0(%RSP),%RDX |
(62) 0x4d89 VEXTRACTI128 $0x1,%YMM2,%XMM2 |
(62) 0x4d8f MOV 0xaa8(%RSP),%RCX |
(62) 0x4d97 MOV $-0x66f74f21,%R8D |
(62) 0x4d9d VPEXTRQ $0x1,%XMM2,%RSI |
(62) 0x4da3 AND $-0x80000000,%RSI |
(62) 0x4daa MOV %EDX,%EDI |
(62) 0x4dac AND $0x7ffffffe,%EDI |
(62) 0x4db2 OR %RSI,%RDI |
(62) 0x4db5 MOV %EDX,%ESI |
(62) 0x4db7 AND $0x1,%ESI |
(62) 0x4dba AND $-0x80000000,%RDX |
(62) 0x4dc1 SHR $0x1,%RDI |
(62) 0x4dc4 XOR 0x1700(%RSP),%RDI |
(62) 0x4dcc NEG %ESI |
(62) 0x4dce AND %R8D,%ESI |
(62) 0x4dd1 XOR %RDI,%RSI |
(62) 0x4dd4 MOV $-0x66f74f21,%EDI |
(62) 0x4dd9 MOV %RSI,0xa98(%RSP) |
(62) 0x4de1 MOV %ECX,%ESI |
(62) 0x4de3 AND $0x7ffffffe,%ESI |
(62) 0x4de9 OR %RDX,%RSI |
(62) 0x4dec MOV %ECX,%EDX |
(62) 0x4dee AND $0x1,%EDX |
(62) 0x4df1 AND $-0x80000000,%RCX |
(62) 0x4df8 SHR $0x1,%RSI |
(62) 0x4dfb XOR 0x1708(%RSP),%RSI |
(62) 0x4e03 NEG %EDX |
(62) 0x4e05 AND %R8D,%EDX |
(62) 0x4e08 XOR %RSI,%RDX |
(62) 0x4e0b MOV %RDX,0xaa0(%RSP) |
(62) 0x4e13 MOV 0xab0(%RSP),%RDX |
(62) 0x4e1b MOV %EDX,%ESI |
(62) 0x4e1d VPBROADCASTQ %RDX,%XMM2 |
(62) 0x4e23 AND $0x7ffffffe,%EDX |
(62) 0x4e29 AND $0x1,%ESI |
(62) 0x4e2c OR %RCX,%RDX |
(62) 0x4e2f NEG %ESI |
(62) 0x4e31 MOV $0xe4,%ECX |
(62) 0x4e36 SHR $0x1,%RDX |
(62) 0x4e39 XOR 0x1710(%RSP),%RDX |
(62) 0x4e41 AND %R8D,%ESI |
(62) 0x4e44 XOR %RDX,%RSI |
(62) 0x4e47 MOV %RSI,0xaa8(%RSP) |
(62) 0x4e4f NOP |
(64) 0x4e50 VMOVDQU 0x398(%RSP,%RCX,8),%XMM3 |
(64) 0x4e59 VPANDQ -0x3fa3(%RIP){1to2},%XMM3,%XMM4 |
(64) 0x4e63 VPTESTMQ -0x3f8d(%RIP){1to2},%XMM3,%K1 |
(64) 0x4e6d VPALIGNR $0x8,%XMM2,%XMM3,%XMM2 |
(64) 0x4e73 VPTERNLOGQ $-0x8,-0x3f96(%RIP){1to2},%XMM2,%XMM4 |
(64) 0x4e7e VPSRLQ $0x1,%XMM4,%XMM2 |
(64) 0x4e83 VPXOR -0x388(%RSP,%RCX,8),%XMM2,%XMM2 |
(64) 0x4e8c VPXORQ -0x3fa6(%RIP){1to2},%XMM2,%XMM2{%K1} |
(64) 0x4e96 VMOVDQU %XMM2,0x390(%RSP,%RCX,8) |
(64) 0x4e9f ADD $0x2,%RCX |
(64) 0x4ea3 VMOVDQA %XMM3,%XMM2 |
(64) 0x4ea7 CMP $0x270,%RCX |
(64) 0x4eae JNE 4e50 |
(62) 0x4eb0 JMP 4c60 |
0x4ec0 VDIVSS %XMM1,%XMM0,%XMM0 |
0x4ec4 VUCOMISS -0x4028(%RIP),%XMM0 |
0x4ecc JAE 4f84 |
0x4ed2 MOV 0x110(%RSP),%RAX |
0x4eda VMOVSS %XMM0,(%RAX,%R10,4) |
0x4ee0 INC %R10 |
0x4ee3 CMP 0x2b8(%RSP),%R10 |
0x4eeb JNE 4700 |
0x4ef6 VXORPS %XMM1,%XMM1,%XMM1 |
0x4efa VMOVAPS %XMM2,%XMM0 |
0x4efe MOV %R9,0xe0(%RSP) |
0x4f06 MOV %R10,0x120(%RSP) |
0x4f0e MOV %RSI,0x20(%RSP) |
0x4f13 VZEROUPPER |
0x4f16 CALL 70a0 <@plt_start@+0xf0> |
0x4f1b VMOVSS -0x407f(%RIP),%XMM2 |
0x4f23 MOV 0x20(%RSP),%RSI |
0x4f28 MOV 0x120(%RSP),%R10 |
0x4f30 MOV 0xe0(%RSP),%R9 |
0x4f38 JMP 4996 |
0x4f3d VXORPS %XMM1,%XMM1,%XMM1 |
0x4f41 VMOVAPS %XMM2,%XMM0 |
0x4f45 MOV %R9,0xe0(%RSP) |
0x4f4d MOV %R10,0x120(%RSP) |
0x4f55 MOV %RSI,0x20(%RSP) |
0x4f5a VZEROUPPER |
0x4f5d CALL 70a0 <@plt_start@+0xf0> |
0x4f62 VMOVSS -0x40c6(%RIP),%XMM2 |
0x4f6a MOV 0x20(%RSP),%RSI |
0x4f6f MOV 0x120(%RSP),%R10 |
0x4f77 MOV 0xe0(%RSP),%R9 |
0x4f7f JMP 4c36 |
0x4f84 VMOVSS -0x40e8(%RIP),%XMM0 |
0x4f8c VXORPS %XMM1,%XMM1,%XMM1 |
0x4f90 MOV %R9,0xe0(%RSP) |
0x4f98 MOV %R10,0x120(%RSP) |
0x4fa0 MOV %RSI,0x20(%RSP) |
0x4fa5 VZEROUPPER |
0x4fa8 CALL 70a0 <@plt_start@+0xf0> |
0x4fad MOV 0x20(%RSP),%RSI |
0x4fb2 MOV 0x120(%RSP),%R10 |
0x4fba MOV 0xe0(%RSP),%R9 |
0x4fc2 JMP 4ed2 |
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/cmath: 1661 - 1661 |
-------------------------------------------------------------------------------- |
1661: { return __builtin_nextafterf(__x, __y); } |
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/random.tcc: 401 - 3370 |
-------------------------------------------------------------------------------- |
401: for (size_t __k = 0; __k < (__n - __m); ++__k) |
402: { |
403: _UIntType __y = ((_M_x[__k] & __upper_mask) |
404: | (_M_x[__k + 1] & __lower_mask)); |
405: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
406: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
412: | (_M_x[__k + 1] & __lower_mask)); |
413: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
414: ^ ((__y & 0x01) ? __a : 0)); |
415: } |
416: |
417: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
418: | (_M_x[0] & __lower_mask)); |
419: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
420: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
455: if (_M_p >= state_size) |
456: _M_gen_rand(); |
457: |
458: // Calculate o(x(i)). |
459: result_type __z = _M_x[_M_p++]; |
460: __z ^= (__z >> __u) & __d; |
461: __z ^= (__z << __s) & __b; |
462: __z ^= (__z << __t) & __c; |
463: __z ^= (__z >> __l); |
[...] |
3364: for (size_t __k = __m; __k != 0; --__k) |
3365: { |
3366: __sum += _RealType(__urng() - __urng.min()) * __tmp; |
3367: __tmp *= __r; |
3368: } |
3369: __ret = __sum / __tmp; |
3370: if (__builtin_expect(__ret >= _RealType(1), 0)) |
/home/eoseret/llm-attention/attention_v2.cpp: 164 - 167 |
-------------------------------------------------------------------------------- |
164: for (size_t i = 0; i < elemsW; ++i) { |
165: h_WQ[i] = dist(rng); |
166: h_WK[i] = dist(rng); |
167: h_WV[i] = dist(rng); |
| Coverage (%) | Name | Source Location | Module |
|---|
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.22 |
| CQA speedup if FP arith vectorized | 1.43 |
| CQA speedup if fully vectorized | 11.78 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.14 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:164-167 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 9.13 |
| CQA cycles if no scalar integer | 7.50 |
| CQA cycles if FP arith vectorized | 6.36 |
| CQA cycles if fully vectorized | 0.77 |
| Front-end cycles | 9.13 |
| P0 cycles | 0.67 |
| P1 cycles | 0.67 |
| P2 cycles | 0.67 |
| P3 cycles | 3.33 |
| P4 cycles | 3.33 |
| P5 cycles | 3.33 |
| P6 cycles | 8.00 |
| P7 cycles | 8.00 |
| P8 cycles | 8.00 |
| P9 cycles | 8.00 |
| P10 cycles | 1.50 |
| P11 cycles | 1.50 |
| P12 cycles | 1.50 |
| P13 cycles | 1.50 |
| P14 cycles | 3.00 |
| P15 cycles | 3.00 |
| DIV/SQRT cycles | 7.50 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 67.00 |
| Nb uops | 73.00 |
| Nb loads | 20.00 |
| Nb stores | 12.00 |
| Nb stack references | 7.00 |
| FLOP/cycle | 0.33 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 3.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 23.67 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 132.00 |
| Bytes stored | 84.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 30.23 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 81.25 |
| Vector-efficiency ratio all | 14.10 |
| Vector-efficiency ratio load | 9.13 |
| Vector-efficiency ratio store | 10.94 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 21.48 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.22 |
| CQA speedup if FP arith vectorized | 1.43 |
| CQA speedup if fully vectorized | 11.78 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.14 |
| Bottlenecks | micro-operation queue |
| Function | main |
| Source | cmath:1661-1661,random.tcc:3369-3370,attention_v2.cpp:164-167 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 9.13 |
| CQA cycles if no scalar integer | 7.50 |
| CQA cycles if FP arith vectorized | 6.36 |
| CQA cycles if fully vectorized | 0.77 |
| Front-end cycles | 9.13 |
| P0 cycles | 0.67 |
| P1 cycles | 0.67 |
| P2 cycles | 0.67 |
| P3 cycles | 3.33 |
| P4 cycles | 3.33 |
| P5 cycles | 3.33 |
| P6 cycles | 8.00 |
| P7 cycles | 8.00 |
| P8 cycles | 8.00 |
| P9 cycles | 8.00 |
| P10 cycles | 1.50 |
| P11 cycles | 1.50 |
| P12 cycles | 1.50 |
| P13 cycles | 1.50 |
| P14 cycles | 3.00 |
| P15 cycles | 3.00 |
| DIV/SQRT cycles | 7.50 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 67.00 |
| Nb uops | 73.00 |
| Nb loads | 20.00 |
| Nb stores | 12.00 |
| Nb stack references | 7.00 |
| FLOP/cycle | 0.33 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 3.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 23.67 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 132.00 |
| Bytes stored | 84.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 30.23 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | 0.00 |
| Vectorization ratio other | 81.25 |
| Vector-efficiency ratio all | 14.10 |
| Vector-efficiency ratio load | 9.13 |
| Vector-efficiency ratio store | 10.94 |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | 6.25 |
| Vector-efficiency ratio other | 21.48 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:164-167 |
| Module | attention-aocc-znver5-256 |
| nb instructions | 67 |
| nb uops | 73 |
| loop length | 376 |
| used x86 registers | 5 |
| used mmx registers | 0 |
| used xmm registers | 3 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 7 |
| micro-operation queue | 9.13 cycles |
| front end | 9.13 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.67 | 0.67 | 0.67 | 3.33 | 3.33 | 3.33 | 8.00 | 8.00 | 8.00 | 8.00 | 1.50 | 1.50 | 1.50 | 1.50 | 3.00 | 3.00 |
| cycles | 0.67 | 0.67 | 0.67 | 3.33 | 3.33 | 3.33 | 8.00 | 8.00 | 8.00 | 8.00 | 1.50 | 1.50 | 1.50 | 1.50 | 3.00 | 3.00 |
| Cycles executing div or sqrt instructions | 7.50 |
| Front-end | 9.13 |
| Dispatch | 8.00 |
| DIV/SQRT | 7.50 |
| Overall L1 | 9.13 |
| all | 16% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 100% |
| all | 40% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 76% |
| all | 30% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 81% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 25% |
| all | 13% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 20% |
| all | 14% |
| load | 9% |
| store | 10% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 21% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVSS -0x3864(%RIP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 47cc <main+0xbbc> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VMOVSS -0x3ae8(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VUCOMISS %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4ef6 <main+0x12e6> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x30(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VMOVAPS %XMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVSS %XMM0,(%RAX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 4a6c <main+0xe5c> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VMOVSS -0x3d88(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VUCOMISS %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4f3d <main+0x132d> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x38(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VMOVAPS %XMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVSS %XMM0,(%RAX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 4d0c <main+0x10fc> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VUCOMISS -0x4028(%RIP),%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4f84 <main+0x1374> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x110(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VMOVSS %XMM0,(%RAX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| INC %R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP 0x2b8(%RSP),%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| JNE 4700 <main+0xaf0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVAPS %XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| VMOVSS -0x407f(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 4996 <main+0xd86> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVAPS %XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| VMOVSS -0x40c6(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 4c36 <main+0x1026> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VMOVSS -0x40e8(%RIP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 4ed2 <main+0x12c2> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:164-167 |
| Module | attention-aocc-znver5-256 |
| nb instructions | 67 |
| nb uops | 73 |
| loop length | 376 |
| used x86 registers | 5 |
| used mmx registers | 0 |
| used xmm registers | 3 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 7 |
| micro-operation queue | 9.13 cycles |
| front end | 9.13 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.67 | 0.67 | 0.67 | 3.33 | 3.33 | 3.33 | 8.00 | 8.00 | 8.00 | 8.00 | 1.50 | 1.50 | 1.50 | 1.50 | 3.00 | 3.00 |
| cycles | 0.67 | 0.67 | 0.67 | 3.33 | 3.33 | 3.33 | 8.00 | 8.00 | 8.00 | 8.00 | 1.50 | 1.50 | 1.50 | 1.50 | 3.00 | 3.00 |
| Cycles executing div or sqrt instructions | 7.50 |
| Front-end | 9.13 |
| Dispatch | 8.00 |
| DIV/SQRT | 7.50 |
| Overall L1 | 9.13 |
| all | 16% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 100% |
| all | 40% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 76% |
| all | 30% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 81% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 25% |
| all | 13% |
| load | 6% |
| store | 6% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 20% |
| all | 14% |
| load | 9% |
| store | 10% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 6% |
| other | 21% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VMOVSS -0x3864(%RIP),%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 47cc <main+0xbbc> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VMOVSS -0x3ae8(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VUCOMISS %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4ef6 <main+0x12e6> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x30(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VMOVAPS %XMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVSS %XMM0,(%RAX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 4a6c <main+0xe5c> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VMOVSS -0x3d88(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VUCOMISS %XMM2,%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4f3d <main+0x132d> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x38(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VMOVAPS %XMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVSS %XMM0,(%RAX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.13 | N/A |
| JMP 4d0c <main+0x10fc> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VDIVSS %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 10 | 2.50 | scal (6.3%) |
| VUCOMISS -0x4028(%RIP),%XMM0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0.50 | 7 | 0.50 | scal (6.3%) |
| JAE 4f84 <main+0x1374> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| MOV 0x110(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| VMOVSS %XMM0,(%RAX,%R10,4) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (6.3%) |
| INC %R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP 0x2b8(%RSP),%R10 | 1 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| JNE 4700 <main+0xaf0> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33-0.50 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVAPS %XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| VMOVSS -0x407f(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 4996 <main+0xd86> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| VMOVAPS %XMM2,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| VMOVSS -0x40c6(%RIP),%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 4c36 <main+0x1026> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
| VMOVSS -0x40e8(%RIP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (6.3%) |
| VXORPS %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | vect (25.0%) |
| MOV %R9,0xe0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %R10,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| MOV %RSI,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (12.5%) |
| VZEROUPPER | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| CALL 70a0 <@plt_start@+0xf0> | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | N/A |
| MOV 0x20(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| MOV 0x120(%RSP),%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | N/A |
| MOV 0xe0(%RSP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (12.5%) |
| JMP 4ed2 <main+0x12c2> | 1 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
