| Loop Id: 68 | Module: attention-clang-skl256 | Source: attention_v2.cpp:164-167 [...] | Coverage: 0.07% |
|---|
| Loop Id: 68 | Module: attention-clang-skl256 | Source: attention_v2.cpp:164-167 [...] | Coverage: 0.07% |
|---|
0x32b0 MOV %RCX,%RDX |
0x32b3 INC %RCX |
0x32b6 MOV %RCX,0x1730(%RSP) |
0x32be MOV 0x3b0(%RSP,%RDX,8),%RDX |
0x32c6 MOV %RDX,%RSI |
0x32c9 SHR $0xb,%RSI |
0x32cd MOV %ESI,%ESI |
0x32cf XOR %RDX,%RSI |
0x32d2 MOV %ESI,%EDX |
0x32d4 SAL $0x7,%EDX |
0x32d7 AND $-0x62d3a980,%EDX |
0x32dd XOR %RSI,%RDX |
0x32e0 MOV %EDX,%ESI |
0x32e2 SAL $0xf,%ESI |
0x32e5 AND $-0x103a0000,%ESI |
0x32eb XOR %RDX,%RSI |
0x32ee MOV %RSI,%RDX |
0x32f1 SHR $0x12,%RDX |
0x32f5 XOR %ESI,%EDX |
0x32f7 VCVTUSI2SS %EDX,%XMM15,%XMM0 |
0x32fd VMULSS 0x2d03(%RIP),%XMM0,%XMM0 |
0x3305 VUCOMISS 0x2cff(%RIP),%XMM0 |
0x330d JB 3675 |
0x3313 CMP $0x270,%RCX |
0x331a JB 32b0 |
0x331c VPBROADCASTQ 0x3b0(%RSP),%YMM0 |
0x3326 XOR %ECX,%ECX |
0x3328 VPBROADCASTQ 0x2d07(%RIP),%YMM12 |
0x3331 VPBROADCASTQ 0x2d06(%RIP),%YMM13 |
0x333a VPBROADCASTQ 0x2d05(%RIP),%YMM14 |
0x3343 VPBROADCASTQ 0x2d04(%RIP),%YMM15 |
0x334c NOPL (%RAX) |
(75) 0x3350 VMOVDQA %YMM0,%YMM1 |
(75) 0x3354 VMOVDQU 0x3b8(%RSP,%RCX,8),%YMM2 |
(75) 0x335d VMOVDQU 0x3d8(%RSP,%RCX,8),%YMM3 |
(75) 0x3366 VMOVDQU 0x3f8(%RSP,%RCX,8),%YMM4 |
(75) 0x336f VMOVDQU 0x418(%RSP,%RCX,8),%YMM0 |
(75) 0x3378 VALIGNQ $0x3,%YMM1,%YMM2,%YMM1 |
(75) 0x337f VALIGNQ $0x3,%YMM2,%YMM3,%YMM5 |
(75) 0x3386 VALIGNQ $0x3,%YMM3,%YMM4,%YMM6 |
(75) 0x338d VALIGNQ $0x3,%YMM4,%YMM0,%YMM7 |
(75) 0x3394 VPAND %YMM2,%YMM13,%YMM8 |
(75) 0x3398 VPAND %YMM3,%YMM13,%YMM9 |
(75) 0x339c VPAND %YMM4,%YMM13,%YMM10 |
(75) 0x33a0 VPAND %YMM0,%YMM13,%YMM11 |
(75) 0x33a4 VPTERNLOGQ $-0x8,%YMM12,%YMM1,%YMM8 |
(75) 0x33ab VPTERNLOGQ $-0x8,%YMM12,%YMM5,%YMM9 |
(75) 0x33b2 VPTERNLOGQ $-0x8,%YMM12,%YMM6,%YMM10 |
(75) 0x33b9 VPTERNLOGQ $-0x8,%YMM12,%YMM7,%YMM11 |
(75) 0x33c0 VPSRLQ $0x1,%YMM8,%YMM1 |
(75) 0x33c6 VPSRLQ $0x1,%YMM9,%YMM5 |
(75) 0x33cc VPSRLQ $0x1,%YMM10,%YMM6 |
(75) 0x33d2 VPSRLQ $0x1,%YMM11,%YMM7 |
(75) 0x33d8 VPXOR 0x1018(%RSP,%RCX,8),%YMM1,%YMM1 |
(75) 0x33e1 VPXOR 0x1038(%RSP,%RCX,8),%YMM5,%YMM5 |
(75) 0x33ea VPXOR 0x1058(%RSP,%RCX,8),%YMM6,%YMM6 |
(75) 0x33f3 VPXOR 0x1078(%RSP,%RCX,8),%YMM7,%YMM7 |
(75) 0x33fc VPTESTMQ %YMM14,%YMM2,%K1 |
(75) 0x3402 VPTESTMQ %YMM14,%YMM3,%K2 |
(75) 0x3408 VPTESTMQ %YMM14,%YMM4,%K3 |
(75) 0x340e VPTESTMQ %YMM14,%YMM0,%K4 |
(75) 0x3414 VPXORQ %YMM15,%YMM1,%YMM1{%K1} |
(75) 0x341a VPXORQ %YMM15,%YMM5,%YMM5{%K2} |
(75) 0x3420 VPXORQ %YMM15,%YMM6,%YMM6{%K3} |
(75) 0x3426 VPXORQ %YMM15,%YMM7,%YMM7{%K4} |
(75) 0x342c VMOVDQU %YMM1,0x3b0(%RSP,%RCX,8) |
(75) 0x3435 VMOVDQU %YMM5,0x3d0(%RSP,%RCX,8) |
(75) 0x343e VMOVDQU %YMM6,0x3f0(%RSP,%RCX,8) |
(75) 0x3447 VMOVDQU %YMM7,0x410(%RSP,%RCX,8) |
(75) 0x3450 ADD $0x10,%RCX |
(75) 0x3454 CMP $0xe0,%RCX |
(75) 0x345b JNE 3350 |
0x3461 VEXTRACTI128 $0x1,%YMM0,%XMM0 |
0x3467 VPEXTRQ $0x1,%XMM0,%RSI |
0x346d AND $-0x80000000,%RSI |
0x3474 MOV 0xab8(%RSP),%RDX |
0x347c MOV 0xac0(%RSP),%RCX |
0x3484 MOV %EDX,%EDI |
0x3486 AND $0x7ffffffe,%EDI |
0x348c OR %RSI,%RDI |
0x348f SHR $0x1,%RDI |
0x3492 XOR 0x1718(%RSP),%RDI |
0x349a MOV %EDX,%ESI |
0x349c AND $0x1,%ESI |
0x349f NEG %ESI |
0x34a1 MOV $-0x66f74f21,%R8D |
0x34a7 AND %R8D,%ESI |
0x34aa XOR %RDI,%RSI |
0x34ad MOV %RSI,0xab0(%RSP) |
0x34b5 AND $-0x80000000,%RDX |
0x34bc MOV %ECX,%ESI |
0x34be AND $0x7ffffffe,%ESI |
0x34c4 OR %RDX,%RSI |
0x34c7 SHR $0x1,%RSI |
0x34ca XOR 0x1720(%RSP),%RSI |
0x34d2 MOV %ECX,%EDX |
0x34d4 AND $0x1,%EDX |
0x34d7 NEG %EDX |
0x34d9 AND %R8D,%EDX |
0x34dc XOR %RSI,%RDX |
0x34df MOV %RDX,0xab8(%RSP) |
0x34e7 AND $-0x80000000,%RCX |
0x34ee MOV 0xac8(%RSP),%RDX |
0x34f6 MOV %EDX,%ESI |
0x34f8 VPBROADCASTQ %RDX,%XMM0 |
0x34fe AND $0x7ffffffe,%EDX |
0x3504 OR %RCX,%RDX |
0x3507 SHR $0x1,%RDX |
0x350a XOR 0x1728(%RSP),%RDX |
0x3512 AND $0x1,%ESI |
0x3515 NEG %ESI |
0x3517 MOV $-0x66f74f21,%EDI |
0x351c AND %R8D,%ESI |
0x351f XOR %RDX,%RSI |
0x3522 MOV %RSI,0xac0(%RSP) |
0x352a MOV $0xe8,%ECX |
0x352f VPBROADCASTQ 0x2b00(%RIP),%XMM5 |
0x3538 VPBROADCASTQ 0x2aff(%RIP),%XMM6 |
0x3541 VPBROADCASTQ 0x2afe(%RIP),%XMM7 |
0x354a VPBROADCASTQ 0x2afd(%RIP),%XMM8 |
0x3553 NOPW %CS:(%RAX,%RAX,1) |
(76) 0x3560 VMOVDQU 0x390(%RSP,%RCX,8),%XMM1 |
(76) 0x3569 VMOVDQU 0x3a0(%RSP,%RCX,8),%XMM2 |
(76) 0x3572 VPALIGNR $0x8,%XMM0,%XMM1,%XMM0 |
(76) 0x3578 VMOVDQU 0x3b0(%RSP,%RCX,8),%XMM3 |
(76) 0x3581 VPAND %XMM6,%XMM1,%XMM4 |
(76) 0x3585 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM4 |
(76) 0x358c VPSRLQ $0x1,%XMM4,%XMM0 |
(76) 0x3591 VPXOR -0x390(%RSP,%RCX,8),%XMM0,%XMM0 |
(76) 0x359a VPTESTMQ %XMM7,%XMM1,%K1 |
(76) 0x35a0 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(76) 0x35a6 VMOVDQU %XMM0,0x388(%RSP,%RCX,8) |
(76) 0x35af VPALIGNR $0x8,%XMM1,%XMM2,%XMM0 |
(76) 0x35b5 VPAND %XMM6,%XMM2,%XMM1 |
(76) 0x35b9 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(76) 0x35c0 VPSRLQ $0x1,%XMM1,%XMM0 |
(76) 0x35c5 VPXOR -0x380(%RSP,%RCX,8),%XMM0,%XMM0 |
(76) 0x35ce VPTESTMQ %XMM7,%XMM2,%K1 |
(76) 0x35d4 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(76) 0x35da VMOVDQU %XMM0,0x398(%RSP,%RCX,8) |
(76) 0x35e3 VPALIGNR $0x8,%XMM2,%XMM3,%XMM0 |
(76) 0x35e9 VPAND %XMM6,%XMM3,%XMM1 |
(76) 0x35ed VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(76) 0x35f4 VPSRLQ $0x1,%XMM1,%XMM0 |
(76) 0x35f9 VPXOR -0x370(%RSP,%RCX,8),%XMM0,%XMM0 |
(76) 0x3602 VPTESTMQ %XMM7,%XMM3,%K1 |
(76) 0x3608 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(76) 0x360e VMOVDQU %XMM0,0x3a8(%RSP,%RCX,8) |
(76) 0x3617 ADD $0x6,%RCX |
(76) 0x361b VMOVDQA %XMM3,%XMM0 |
(76) 0x361f CMP $0x274,%RCX |
(76) 0x3626 JNE 3560 |
0x362c MOV 0x1728(%RSP),%RCX |
0x3634 MOV $-0x80000000,%RDX |
0x363b AND %RDX,%RCX |
0x363e MOV 0x3b0(%RSP),%RDX |
0x3646 MOV %EDX,%ESI |
0x3648 AND $0x7ffffffe,%ESI |
0x364e OR %RCX,%RSI |
0x3651 SHR $0x1,%RSI |
0x3654 XOR 0x1010(%RSP),%RSI |
0x365c AND $0x1,%EDX |
0x365f NEG %EDX |
0x3661 AND %EDI,%EDX |
0x3663 XOR %RSI,%RDX |
0x3666 MOV %RDX,0x1728(%RSP) |
0x366e XOR %ECX,%ECX |
0x3670 JMP 32b0 |
0x3675 VMOVSS %XMM0,(%RBX,%RAX,4) |
0x367a JMP 36e3 |
(72) 0x3680 MOV %RCX,%RDX |
(72) 0x3683 INC %RCX |
(72) 0x3686 MOV %RCX,0x1730(%RSP) |
(72) 0x368e MOV 0x3b0(%RSP,%RDX,8),%RDX |
(72) 0x3696 MOV %RDX,%RSI |
(72) 0x3699 SHR $0xb,%RSI |
(72) 0x369d MOV %ESI,%ESI |
(72) 0x369f XOR %RDX,%RSI |
(72) 0x36a2 MOV %ESI,%EDX |
(72) 0x36a4 SAL $0x7,%EDX |
(72) 0x36a7 AND $-0x62d3a980,%EDX |
(72) 0x36ad XOR %RSI,%RDX |
(72) 0x36b0 MOV %EDX,%ESI |
(72) 0x36b2 SAL $0xf,%ESI |
(72) 0x36b5 AND $-0x103a0000,%ESI |
(72) 0x36bb XOR %RDX,%RSI |
(72) 0x36be MOV %RSI,%RDX |
(72) 0x36c1 SHR $0x12,%RDX |
(72) 0x36c5 XOR %ESI,%EDX |
(72) 0x36c7 VCVTUSI2SS %EDX,%XMM15,%XMM0 |
(72) 0x36cd VMULSS 0x2933(%RIP),%XMM0,%XMM0 |
(72) 0x36d5 VUCOMISS 0x292f(%RIP),%XMM0 |
(72) 0x36dd JB 3a45 |
(72) 0x36e3 CMP $0x270,%RCX |
(72) 0x36ea JB 3680 |
(72) 0x36ec VPBROADCASTQ 0x3b0(%RSP),%YMM0 |
(72) 0x36f6 XOR %ECX,%ECX |
(72) 0x36f8 VPBROADCASTQ 0x2937(%RIP),%YMM12 |
(72) 0x3701 VPBROADCASTQ 0x2936(%RIP),%YMM13 |
(72) 0x370a VPBROADCASTQ 0x2935(%RIP),%YMM14 |
(72) 0x3713 VPBROADCASTQ 0x2934(%RIP),%YMM15 |
(72) 0x371c NOPL (%RAX) |
(73) 0x3720 VMOVDQA %YMM0,%YMM1 |
(73) 0x3724 VMOVDQU 0x3b8(%RSP,%RCX,8),%YMM2 |
(73) 0x372d VMOVDQU 0x3d8(%RSP,%RCX,8),%YMM3 |
(73) 0x3736 VMOVDQU 0x3f8(%RSP,%RCX,8),%YMM4 |
(73) 0x373f VMOVDQU 0x418(%RSP,%RCX,8),%YMM0 |
(73) 0x3748 VALIGNQ $0x3,%YMM1,%YMM2,%YMM1 |
(73) 0x374f VALIGNQ $0x3,%YMM2,%YMM3,%YMM5 |
(73) 0x3756 VALIGNQ $0x3,%YMM3,%YMM4,%YMM6 |
(73) 0x375d VALIGNQ $0x3,%YMM4,%YMM0,%YMM7 |
(73) 0x3764 VPAND %YMM2,%YMM13,%YMM8 |
(73) 0x3768 VPAND %YMM3,%YMM13,%YMM9 |
(73) 0x376c VPAND %YMM4,%YMM13,%YMM10 |
(73) 0x3770 VPAND %YMM0,%YMM13,%YMM11 |
(73) 0x3774 VPTERNLOGQ $-0x8,%YMM12,%YMM1,%YMM8 |
(73) 0x377b VPTERNLOGQ $-0x8,%YMM12,%YMM5,%YMM9 |
(73) 0x3782 VPTERNLOGQ $-0x8,%YMM12,%YMM6,%YMM10 |
(73) 0x3789 VPTERNLOGQ $-0x8,%YMM12,%YMM7,%YMM11 |
(73) 0x3790 VPSRLQ $0x1,%YMM8,%YMM1 |
(73) 0x3796 VPSRLQ $0x1,%YMM9,%YMM5 |
(73) 0x379c VPSRLQ $0x1,%YMM10,%YMM6 |
(73) 0x37a2 VPSRLQ $0x1,%YMM11,%YMM7 |
(73) 0x37a8 VPXOR 0x1018(%RSP,%RCX,8),%YMM1,%YMM1 |
(73) 0x37b1 VPXOR 0x1038(%RSP,%RCX,8),%YMM5,%YMM5 |
(73) 0x37ba VPXOR 0x1058(%RSP,%RCX,8),%YMM6,%YMM6 |
(73) 0x37c3 VPXOR 0x1078(%RSP,%RCX,8),%YMM7,%YMM7 |
(73) 0x37cc VPTESTMQ %YMM14,%YMM2,%K1 |
(73) 0x37d2 VPTESTMQ %YMM14,%YMM3,%K2 |
(73) 0x37d8 VPTESTMQ %YMM14,%YMM4,%K3 |
(73) 0x37de VPTESTMQ %YMM14,%YMM0,%K4 |
(73) 0x37e4 VPXORQ %YMM15,%YMM1,%YMM1{%K1} |
(73) 0x37ea VPXORQ %YMM15,%YMM5,%YMM5{%K2} |
(73) 0x37f0 VPXORQ %YMM15,%YMM6,%YMM6{%K3} |
(73) 0x37f6 VPXORQ %YMM15,%YMM7,%YMM7{%K4} |
(73) 0x37fc VMOVDQU %YMM1,0x3b0(%RSP,%RCX,8) |
(73) 0x3805 VMOVDQU %YMM5,0x3d0(%RSP,%RCX,8) |
(73) 0x380e VMOVDQU %YMM6,0x3f0(%RSP,%RCX,8) |
(73) 0x3817 VMOVDQU %YMM7,0x410(%RSP,%RCX,8) |
(73) 0x3820 ADD $0x10,%RCX |
(73) 0x3824 CMP $0xe0,%RCX |
(73) 0x382b JNE 3720 |
(72) 0x3831 VEXTRACTI128 $0x1,%YMM0,%XMM0 |
(72) 0x3837 VPEXTRQ $0x1,%XMM0,%RSI |
(72) 0x383d AND $-0x80000000,%RSI |
(72) 0x3844 MOV 0xab8(%RSP),%RDX |
(72) 0x384c MOV 0xac0(%RSP),%RCX |
(72) 0x3854 MOV %EDX,%EDI |
(72) 0x3856 AND $0x7ffffffe,%EDI |
(72) 0x385c OR %RSI,%RDI |
(72) 0x385f SHR $0x1,%RDI |
(72) 0x3862 XOR 0x1718(%RSP),%RDI |
(72) 0x386a MOV %EDX,%ESI |
(72) 0x386c AND $0x1,%ESI |
(72) 0x386f NEG %ESI |
(72) 0x3871 MOV $-0x66f74f21,%R8D |
(72) 0x3877 AND %R8D,%ESI |
(72) 0x387a XOR %RDI,%RSI |
(72) 0x387d MOV %RSI,0xab0(%RSP) |
(72) 0x3885 AND $-0x80000000,%RDX |
(72) 0x388c MOV %ECX,%ESI |
(72) 0x388e AND $0x7ffffffe,%ESI |
(72) 0x3894 OR %RDX,%RSI |
(72) 0x3897 SHR $0x1,%RSI |
(72) 0x389a XOR 0x1720(%RSP),%RSI |
(72) 0x38a2 MOV %ECX,%EDX |
(72) 0x38a4 AND $0x1,%EDX |
(72) 0x38a7 NEG %EDX |
(72) 0x38a9 AND %R8D,%EDX |
(72) 0x38ac XOR %RSI,%RDX |
(72) 0x38af MOV %RDX,0xab8(%RSP) |
(72) 0x38b7 AND $-0x80000000,%RCX |
(72) 0x38be MOV 0xac8(%RSP),%RDX |
(72) 0x38c6 MOV %EDX,%ESI |
(72) 0x38c8 VPBROADCASTQ %RDX,%XMM0 |
(72) 0x38ce AND $0x7ffffffe,%EDX |
(72) 0x38d4 OR %RCX,%RDX |
(72) 0x38d7 SHR $0x1,%RDX |
(72) 0x38da XOR 0x1728(%RSP),%RDX |
(72) 0x38e2 AND $0x1,%ESI |
(72) 0x38e5 NEG %ESI |
(72) 0x38e7 MOV $-0x66f74f21,%EDI |
(72) 0x38ec AND %R8D,%ESI |
(72) 0x38ef XOR %RDX,%RSI |
(72) 0x38f2 MOV %RSI,0xac0(%RSP) |
(72) 0x38fa MOV $0xe8,%ECX |
(72) 0x38ff VPBROADCASTQ 0x2730(%RIP),%XMM5 |
(72) 0x3908 VPBROADCASTQ 0x272f(%RIP),%XMM6 |
(72) 0x3911 VPBROADCASTQ 0x272e(%RIP),%XMM7 |
(72) 0x391a VPBROADCASTQ 0x272d(%RIP),%XMM8 |
(72) 0x3923 NOPW %CS:(%RAX,%RAX,1) |
(74) 0x3930 VMOVDQU 0x390(%RSP,%RCX,8),%XMM1 |
(74) 0x3939 VMOVDQU 0x3a0(%RSP,%RCX,8),%XMM2 |
(74) 0x3942 VPALIGNR $0x8,%XMM0,%XMM1,%XMM0 |
(74) 0x3948 VMOVDQU 0x3b0(%RSP,%RCX,8),%XMM3 |
(74) 0x3951 VPAND %XMM6,%XMM1,%XMM4 |
(74) 0x3955 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM4 |
(74) 0x395c VPSRLQ $0x1,%XMM4,%XMM0 |
(74) 0x3961 VPXOR -0x390(%RSP,%RCX,8),%XMM0,%XMM0 |
(74) 0x396a VPTESTMQ %XMM7,%XMM1,%K1 |
(74) 0x3970 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(74) 0x3976 VMOVDQU %XMM0,0x388(%RSP,%RCX,8) |
(74) 0x397f VPALIGNR $0x8,%XMM1,%XMM2,%XMM0 |
(74) 0x3985 VPAND %XMM6,%XMM2,%XMM1 |
(74) 0x3989 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(74) 0x3990 VPSRLQ $0x1,%XMM1,%XMM0 |
(74) 0x3995 VPXOR -0x380(%RSP,%RCX,8),%XMM0,%XMM0 |
(74) 0x399e VPTESTMQ %XMM7,%XMM2,%K1 |
(74) 0x39a4 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(74) 0x39aa VMOVDQU %XMM0,0x398(%RSP,%RCX,8) |
(74) 0x39b3 VPALIGNR $0x8,%XMM2,%XMM3,%XMM0 |
(74) 0x39b9 VPAND %XMM6,%XMM3,%XMM1 |
(74) 0x39bd VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(74) 0x39c4 VPSRLQ $0x1,%XMM1,%XMM0 |
(74) 0x39c9 VPXOR -0x370(%RSP,%RCX,8),%XMM0,%XMM0 |
(74) 0x39d2 VPTESTMQ %XMM7,%XMM3,%K1 |
(74) 0x39d8 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(74) 0x39de VMOVDQU %XMM0,0x3a8(%RSP,%RCX,8) |
(74) 0x39e7 ADD $0x6,%RCX |
(74) 0x39eb VMOVDQA %XMM3,%XMM0 |
(74) 0x39ef CMP $0x274,%RCX |
(74) 0x39f6 JNE 3930 |
(72) 0x39fc MOV 0x1728(%RSP),%RCX |
(72) 0x3a04 MOV $-0x80000000,%RDX |
(72) 0x3a0b AND %RDX,%RCX |
(72) 0x3a0e MOV 0x3b0(%RSP),%RDX |
(72) 0x3a16 MOV %EDX,%ESI |
(72) 0x3a18 AND $0x7ffffffe,%ESI |
(72) 0x3a1e OR %RCX,%RSI |
(72) 0x3a21 SHR $0x1,%RSI |
(72) 0x3a24 XOR 0x1010(%RSP),%RSI |
(72) 0x3a2c AND $0x1,%EDX |
(72) 0x3a2f NEG %EDX |
(72) 0x3a31 AND %EDI,%EDX |
(72) 0x3a33 XOR %RSI,%RDX |
(72) 0x3a36 MOV %RDX,0x1728(%RSP) |
(72) 0x3a3e XOR %ECX,%ECX |
(72) 0x3a40 JMP 3680 |
0x3a45 MOV 0x58(%RSP),%RDX |
0x3a4a VMOVSS %XMM0,(%RDX,%RAX,4) |
0x3a4f JMP 3ac3 |
(69) 0x3a60 MOV %RCX,%RDX |
(69) 0x3a63 INC %RCX |
(69) 0x3a66 MOV %RCX,0x1730(%RSP) |
(69) 0x3a6e MOV 0x3b0(%RSP,%RDX,8),%RDX |
(69) 0x3a76 MOV %RDX,%RSI |
(69) 0x3a79 SHR $0xb,%RSI |
(69) 0x3a7d MOV %ESI,%ESI |
(69) 0x3a7f XOR %RDX,%RSI |
(69) 0x3a82 MOV %ESI,%EDX |
(69) 0x3a84 SAL $0x7,%EDX |
(69) 0x3a87 AND $-0x62d3a980,%EDX |
(69) 0x3a8d XOR %RSI,%RDX |
(69) 0x3a90 MOV %EDX,%ESI |
(69) 0x3a92 SAL $0xf,%ESI |
(69) 0x3a95 AND $-0x103a0000,%ESI |
(69) 0x3a9b XOR %RDX,%RSI |
(69) 0x3a9e MOV %RSI,%RDX |
(69) 0x3aa1 SHR $0x12,%RDX |
(69) 0x3aa5 XOR %ESI,%EDX |
(69) 0x3aa7 VCVTUSI2SS %EDX,%XMM15,%XMM0 |
(69) 0x3aad VMULSS 0x2553(%RIP),%XMM0,%XMM0 |
(69) 0x3ab5 VUCOMISS 0x254f(%RIP),%XMM0 |
(69) 0x3abd JB 3e25 |
(69) 0x3ac3 CMP $0x270,%RCX |
(69) 0x3aca JB 3a60 |
(69) 0x3acc VPBROADCASTQ 0x3b0(%RSP),%YMM0 |
(69) 0x3ad6 XOR %ECX,%ECX |
(69) 0x3ad8 VPBROADCASTQ 0x2557(%RIP),%YMM12 |
(69) 0x3ae1 VPBROADCASTQ 0x2556(%RIP),%YMM13 |
(69) 0x3aea VPBROADCASTQ 0x2555(%RIP),%YMM14 |
(69) 0x3af3 VPBROADCASTQ 0x2554(%RIP),%YMM15 |
(69) 0x3afc NOPL (%RAX) |
(70) 0x3b00 VMOVDQA %YMM0,%YMM1 |
(70) 0x3b04 VMOVDQU 0x3b8(%RSP,%RCX,8),%YMM2 |
(70) 0x3b0d VMOVDQU 0x3d8(%RSP,%RCX,8),%YMM3 |
(70) 0x3b16 VMOVDQU 0x3f8(%RSP,%RCX,8),%YMM4 |
(70) 0x3b1f VMOVDQU 0x418(%RSP,%RCX,8),%YMM0 |
(70) 0x3b28 VALIGNQ $0x3,%YMM1,%YMM2,%YMM1 |
(70) 0x3b2f VALIGNQ $0x3,%YMM2,%YMM3,%YMM5 |
(70) 0x3b36 VALIGNQ $0x3,%YMM3,%YMM4,%YMM6 |
(70) 0x3b3d VALIGNQ $0x3,%YMM4,%YMM0,%YMM7 |
(70) 0x3b44 VPAND %YMM2,%YMM13,%YMM8 |
(70) 0x3b48 VPAND %YMM3,%YMM13,%YMM9 |
(70) 0x3b4c VPAND %YMM4,%YMM13,%YMM10 |
(70) 0x3b50 VPAND %YMM0,%YMM13,%YMM11 |
(70) 0x3b54 VPTERNLOGQ $-0x8,%YMM12,%YMM1,%YMM8 |
(70) 0x3b5b VPTERNLOGQ $-0x8,%YMM12,%YMM5,%YMM9 |
(70) 0x3b62 VPTERNLOGQ $-0x8,%YMM12,%YMM6,%YMM10 |
(70) 0x3b69 VPTERNLOGQ $-0x8,%YMM12,%YMM7,%YMM11 |
(70) 0x3b70 VPSRLQ $0x1,%YMM8,%YMM1 |
(70) 0x3b76 VPSRLQ $0x1,%YMM9,%YMM5 |
(70) 0x3b7c VPSRLQ $0x1,%YMM10,%YMM6 |
(70) 0x3b82 VPSRLQ $0x1,%YMM11,%YMM7 |
(70) 0x3b88 VPXOR 0x1018(%RSP,%RCX,8),%YMM1,%YMM1 |
(70) 0x3b91 VPXOR 0x1038(%RSP,%RCX,8),%YMM5,%YMM5 |
(70) 0x3b9a VPXOR 0x1058(%RSP,%RCX,8),%YMM6,%YMM6 |
(70) 0x3ba3 VPXOR 0x1078(%RSP,%RCX,8),%YMM7,%YMM7 |
(70) 0x3bac VPTESTMQ %YMM14,%YMM2,%K1 |
(70) 0x3bb2 VPTESTMQ %YMM14,%YMM3,%K2 |
(70) 0x3bb8 VPTESTMQ %YMM14,%YMM4,%K3 |
(70) 0x3bbe VPTESTMQ %YMM14,%YMM0,%K4 |
(70) 0x3bc4 VPXORQ %YMM15,%YMM1,%YMM1{%K1} |
(70) 0x3bca VPXORQ %YMM15,%YMM5,%YMM5{%K2} |
(70) 0x3bd0 VPXORQ %YMM15,%YMM6,%YMM6{%K3} |
(70) 0x3bd6 VPXORQ %YMM15,%YMM7,%YMM7{%K4} |
(70) 0x3bdc VMOVDQU %YMM1,0x3b0(%RSP,%RCX,8) |
(70) 0x3be5 VMOVDQU %YMM5,0x3d0(%RSP,%RCX,8) |
(70) 0x3bee VMOVDQU %YMM6,0x3f0(%RSP,%RCX,8) |
(70) 0x3bf7 VMOVDQU %YMM7,0x410(%RSP,%RCX,8) |
(70) 0x3c00 ADD $0x10,%RCX |
(70) 0x3c04 CMP $0xe0,%RCX |
(70) 0x3c0b JNE 3b00 |
(69) 0x3c11 VEXTRACTI128 $0x1,%YMM0,%XMM0 |
(69) 0x3c17 VPEXTRQ $0x1,%XMM0,%RSI |
(69) 0x3c1d AND $-0x80000000,%RSI |
(69) 0x3c24 MOV 0xab8(%RSP),%RDX |
(69) 0x3c2c MOV 0xac0(%RSP),%RCX |
(69) 0x3c34 MOV %EDX,%EDI |
(69) 0x3c36 AND $0x7ffffffe,%EDI |
(69) 0x3c3c OR %RSI,%RDI |
(69) 0x3c3f SHR $0x1,%RDI |
(69) 0x3c42 XOR 0x1718(%RSP),%RDI |
(69) 0x3c4a MOV %EDX,%ESI |
(69) 0x3c4c AND $0x1,%ESI |
(69) 0x3c4f NEG %ESI |
(69) 0x3c51 MOV $-0x66f74f21,%R8D |
(69) 0x3c57 AND %R8D,%ESI |
(69) 0x3c5a XOR %RDI,%RSI |
(69) 0x3c5d MOV %RSI,0xab0(%RSP) |
(69) 0x3c65 AND $-0x80000000,%RDX |
(69) 0x3c6c MOV %ECX,%ESI |
(69) 0x3c6e AND $0x7ffffffe,%ESI |
(69) 0x3c74 OR %RDX,%RSI |
(69) 0x3c77 SHR $0x1,%RSI |
(69) 0x3c7a XOR 0x1720(%RSP),%RSI |
(69) 0x3c82 MOV %ECX,%EDX |
(69) 0x3c84 AND $0x1,%EDX |
(69) 0x3c87 NEG %EDX |
(69) 0x3c89 AND %R8D,%EDX |
(69) 0x3c8c XOR %RSI,%RDX |
(69) 0x3c8f MOV %RDX,0xab8(%RSP) |
(69) 0x3c97 AND $-0x80000000,%RCX |
(69) 0x3c9e MOV 0xac8(%RSP),%RDX |
(69) 0x3ca6 MOV %EDX,%ESI |
(69) 0x3ca8 VPBROADCASTQ %RDX,%XMM0 |
(69) 0x3cae AND $0x7ffffffe,%EDX |
(69) 0x3cb4 OR %RCX,%RDX |
(69) 0x3cb7 SHR $0x1,%RDX |
(69) 0x3cba XOR 0x1728(%RSP),%RDX |
(69) 0x3cc2 AND $0x1,%ESI |
(69) 0x3cc5 NEG %ESI |
(69) 0x3cc7 MOV $-0x66f74f21,%EDI |
(69) 0x3ccc AND %R8D,%ESI |
(69) 0x3ccf XOR %RDX,%RSI |
(69) 0x3cd2 MOV %RSI,0xac0(%RSP) |
(69) 0x3cda MOV $0xe8,%ECX |
(69) 0x3cdf VPBROADCASTQ 0x2350(%RIP),%XMM5 |
(69) 0x3ce8 VPBROADCASTQ 0x234f(%RIP),%XMM6 |
(69) 0x3cf1 VPBROADCASTQ 0x234e(%RIP),%XMM7 |
(69) 0x3cfa VPBROADCASTQ 0x234d(%RIP),%XMM8 |
(69) 0x3d03 NOPW %CS:(%RAX,%RAX,1) |
(71) 0x3d10 VMOVDQU 0x390(%RSP,%RCX,8),%XMM1 |
(71) 0x3d19 VMOVDQU 0x3a0(%RSP,%RCX,8),%XMM2 |
(71) 0x3d22 VPALIGNR $0x8,%XMM0,%XMM1,%XMM0 |
(71) 0x3d28 VMOVDQU 0x3b0(%RSP,%RCX,8),%XMM3 |
(71) 0x3d31 VPAND %XMM6,%XMM1,%XMM4 |
(71) 0x3d35 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM4 |
(71) 0x3d3c VPSRLQ $0x1,%XMM4,%XMM0 |
(71) 0x3d41 VPXOR -0x390(%RSP,%RCX,8),%XMM0,%XMM0 |
(71) 0x3d4a VPTESTMQ %XMM7,%XMM1,%K1 |
(71) 0x3d50 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(71) 0x3d56 VMOVDQU %XMM0,0x388(%RSP,%RCX,8) |
(71) 0x3d5f VPALIGNR $0x8,%XMM1,%XMM2,%XMM0 |
(71) 0x3d65 VPAND %XMM6,%XMM2,%XMM1 |
(71) 0x3d69 VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(71) 0x3d70 VPSRLQ $0x1,%XMM1,%XMM0 |
(71) 0x3d75 VPXOR -0x380(%RSP,%RCX,8),%XMM0,%XMM0 |
(71) 0x3d7e VPTESTMQ %XMM7,%XMM2,%K1 |
(71) 0x3d84 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(71) 0x3d8a VMOVDQU %XMM0,0x398(%RSP,%RCX,8) |
(71) 0x3d93 VPALIGNR $0x8,%XMM2,%XMM3,%XMM0 |
(71) 0x3d99 VPAND %XMM6,%XMM3,%XMM1 |
(71) 0x3d9d VPTERNLOGQ $-0x8,%XMM5,%XMM0,%XMM1 |
(71) 0x3da4 VPSRLQ $0x1,%XMM1,%XMM0 |
(71) 0x3da9 VPXOR -0x370(%RSP,%RCX,8),%XMM0,%XMM0 |
(71) 0x3db2 VPTESTMQ %XMM7,%XMM3,%K1 |
(71) 0x3db8 VPXORQ %XMM8,%XMM0,%XMM0{%K1} |
(71) 0x3dbe VMOVDQU %XMM0,0x3a8(%RSP,%RCX,8) |
(71) 0x3dc7 ADD $0x6,%RCX |
(71) 0x3dcb VMOVDQA %XMM3,%XMM0 |
(71) 0x3dcf CMP $0x274,%RCX |
(71) 0x3dd6 JNE 3d10 |
(69) 0x3ddc MOV 0x1728(%RSP),%RCX |
(69) 0x3de4 MOV $-0x80000000,%RDX |
(69) 0x3deb AND %RDX,%RCX |
(69) 0x3dee MOV 0x3b0(%RSP),%RDX |
(69) 0x3df6 MOV %EDX,%ESI |
(69) 0x3df8 AND $0x7ffffffe,%ESI |
(69) 0x3dfe OR %RCX,%RSI |
(69) 0x3e01 SHR $0x1,%RSI |
(69) 0x3e04 XOR 0x1010(%RSP),%RSI |
(69) 0x3e0c AND $0x1,%EDX |
(69) 0x3e0f NEG %EDX |
(69) 0x3e11 AND %EDI,%EDX |
(69) 0x3e13 XOR %RSI,%RDX |
(69) 0x3e16 MOV %RDX,0x1728(%RSP) |
(69) 0x3e1e XOR %ECX,%ECX |
(69) 0x3e20 JMP 3a60 |
0x3e25 MOV 0x90(%RSP),%RDX |
0x3e2d VMOVSS %XMM0,(%RDX,%RAX,4) |
0x3e32 INC %RAX |
0x3e35 CMP 0x50(%RSP),%RAX |
0x3e3a JNE 3313 |
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/16.1.1/../../../../include/c++/16.1.1/bits/random.tcc: 404 - 3558 |
-------------------------------------------------------------------------------- |
404: for (size_t __k = 0; __k < (__n - __m); ++__k) |
405: { |
406: _UIntType __y = ((_M_x[__k] & __upper_mask) |
407: | (_M_x[__k + 1] & __lower_mask)); |
408: _M_x[__k] = (_M_x[__k + __m] ^ (__y >> 1) |
409: ^ ((__y & 0x01) ? __a : 0)); |
410: } |
411: |
412: for (size_t __k = (__n - __m); __k < (__n - 1); ++__k) |
413: { |
414: _UIntType __y = ((_M_x[__k] & __upper_mask) |
415: | (_M_x[__k + 1] & __lower_mask)); |
416: _M_x[__k] = (_M_x[__k + (__m - __n)] ^ (__y >> 1) |
417: ^ ((__y & 0x01) ? __a : 0)); |
418: } |
419: |
420: _UIntType __y = ((_M_x[__n - 1] & __upper_mask) |
421: | (_M_x[0] & __lower_mask)); |
422: _M_x[__n - 1] = (_M_x[__m - 1] ^ (__y >> 1) |
423: ^ ((__y & 0x01) ? __a : 0)); |
[...] |
458: if (_M_p >= state_size) |
459: _M_gen_rand(); |
460: |
461: // Calculate o(x(i)). |
462: result_type __z = _M_x[_M_p++]; |
463: __z ^= (__z >> __u) & __d; |
464: __z ^= (__z << __s) & __b; |
465: __z ^= (__z << __t) & __c; |
466: __z ^= (__z >> __l); |
[...] |
3557: const _RealT __ret = _RealT(__sum >> __log2_x) / _RealT(__rd); |
3558: if (__ret < _RealT(1.0)) |
/home/eoseret/Applications/llm-attention/attention_v2.cpp: 164 - 167 |
-------------------------------------------------------------------------------- |
164: for (size_t i = 0; i < elemsW; ++i) { |
165: h_WQ[i] = dist(rng); |
166: h_WK[i] = dist(rng); |
167: h_WV[i] = dist(rng); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | attention-clang-skl256 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 4.36 |
| CQA speedup if FP arith vectorized | 1.98 |
| CQA speedup if fully vectorized | 11.97 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.63 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558,attention_v2.cpp:164-167 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 27.25 |
| CQA cycles if no scalar integer | 6.25 |
| CQA cycles if FP arith vectorized | 13.76 |
| CQA cycles if fully vectorized | 2.28 |
| Front-end cycles | 27.25 |
| P0 cycles | 16.75 |
| P1 cycles | 16.75 |
| P2 cycles | 12.00 |
| P3 cycles | 12.00 |
| P4 cycles | 8.00 |
| P5 cycles | 16.75 |
| P6 cycles | 16.75 |
| P7 cycles | 8.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 107.00 |
| Nb uops | 109.00 |
| Nb loads | 24.00 |
| Nb stores | 8.00 |
| Nb stack references | 13.00 |
| FLOP/cycle | 0.04 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.66 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 184.00 |
| Bytes stored | 52.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 1.47 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 1.79 |
| Vector-efficiency ratio all | 10.02 |
| Vector-efficiency ratio load | 11.72 |
| Vector-efficiency ratio store | 10.16 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.93 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 4.36 |
| CQA speedup if FP arith vectorized | 1.98 |
| CQA speedup if fully vectorized | 11.97 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.63 |
| Bottlenecks | micro-operation queue, |
| Function | main |
| Source | random.tcc:404-409,random.tcc:412-412,random.tcc:420-423,random.tcc:458-458,random.tcc:462-466,random.tcc:3557-3558,attention_v2.cpp:164-167 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 27.25 |
| CQA cycles if no scalar integer | 6.25 |
| CQA cycles if FP arith vectorized | 13.76 |
| CQA cycles if fully vectorized | 2.28 |
| Front-end cycles | 27.25 |
| P0 cycles | 16.75 |
| P1 cycles | 16.75 |
| P2 cycles | 12.00 |
| P3 cycles | 12.00 |
| P4 cycles | 8.00 |
| P5 cycles | 16.75 |
| P6 cycles | 16.75 |
| P7 cycles | 8.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 107.00 |
| Nb uops | 109.00 |
| Nb loads | 24.00 |
| Nb stores | 8.00 |
| Nb stack references | 13.00 |
| FLOP/cycle | 0.04 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 1.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 8.66 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 184.00 |
| Bytes stored | 52.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 1.47 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 1.79 |
| Vector-efficiency ratio all | 10.02 |
| Vector-efficiency ratio load | 11.72 |
| Vector-efficiency ratio store | 10.16 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.93 |
| Path / |
| Function | main |
| Source file and lines | attention_v2.cpp:164-167 |
| Module | attention-clang-skl256 |
| nb instructions | 107 |
| nb uops | 109 |
| loop length | 534 |
| used x86 registers | 8 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 5 |
| used zmm registers | 0 |
| nb stack references | 13 |
| micro-operation queue | 27.25 cycles |
| front end | 27.25 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
|---|---|---|---|---|---|---|---|---|
| uops | 16.75 | 16.75 | 12.00 | 12.00 | 8.00 | 16.75 | 16.75 | 8.00 |
| cycles | 16.75 | 16.75 | 12.00 | 12.00 | 8.00 | 16.75 | 16.75 | 8.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 27.25 |
| Dispatch | 16.75 |
| Overall L1 | 27.25 |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 1% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 1% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 6% |
| load | 6% |
| store | 6% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 10% |
| load | 11% |
| store | 10% |
| mul | 6% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RCX,0x1730(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3b0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x2d03(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x2cff(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3675 <main+0x1195> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 32b0 <main+0xdd0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPBROADCASTQ 0x3b0(%RSP),%YMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ 0x2d07(%RIP),%YMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2d06(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2d05(%RIP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2d04(%RIP),%YMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTI128 $0x1,%YMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VPEXTRQ $0x1,%XMM0,%RSI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xab8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xac0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1718(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%R8D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDI,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xab0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1720(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R8D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0xab8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xac8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RDX,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| OR %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR 0x1728(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xac0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0xe8,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPBROADCASTQ 0x2b00(%RIP),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2aff(%RIP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2afe(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2afd(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1728(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x3b0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1010(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %EDI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0x1728(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| JMP 32b0 <main+0xdd0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VMOVSS %XMM0,(%RBX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| JMP 36e3 <main+0x1203> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM0,(%RDX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| JMP 3ac3 <main+0x15e3> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0x90(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM0,(%RDX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP 0x50(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JNE 3313 <main+0xe33> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| Function | main |
| Source file and lines | attention_v2.cpp:164-167 |
| Module | attention-clang-skl256 |
| nb instructions | 107 |
| nb uops | 109 |
| loop length | 534 |
| used x86 registers | 8 |
| used mmx registers | 0 |
| used xmm registers | 6 |
| used ymm registers | 5 |
| used zmm registers | 0 |
| nb stack references | 13 |
| micro-operation queue | 27.25 cycles |
| front end | 27.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 16.75 | 16.75 | 12.00 | 12.00 | 8.00 | 16.75 | 16.75 | 8.00 |
| cycles | 16.75 | 16.75 | 12.00 | 12.00 | 8.00 | 16.75 | 16.75 | 8.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 27.25 |
| Dispatch | 16.75 |
| Overall L1 | 27.25 |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 1% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 1% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 1% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 6% |
| load | 6% |
| store | 6% |
| mul | 6% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| all | 10% |
| load | 11% |
| store | 10% |
| mul | 6% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RCX,0x1730(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV 0x3b0(%RSP,%RDX,8),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %RDX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| SHR $0xb,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %ESI,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SAL $0x7,%EDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| AND $-0x62d3a980,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| SAL $0xf,%ESI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (6.3%) |
| AND $-0x103a0000,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| SHR $0x12,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR %ESI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| VCVTUSI2SS %EDX,%XMM15,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 6 | 1 | scal (6.3%) |
| VMULSS 0x2d03(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | scal (6.3%) |
| VUCOMISS 0x2cff(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 | scal (6.3%) |
| JB 3675 <main+0x1195> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| CMP $0x270,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| JB 32b0 <main+0xdd0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| VPBROADCASTQ 0x3b0(%RSP),%YMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ 0x2d07(%RIP),%YMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2d06(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2d05(%RIP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2d04(%RIP),%YMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 | scal (12.5%) |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| VEXTRACTI128 $0x1,%YMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | vect (25.0%) |
| VPEXTRQ $0x1,%XMM0,%RSI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xab8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV 0xac0(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV %EDX,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1718(%RSP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%R8D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDI,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xab0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %ECX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1720(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %R8D,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0xab8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $-0x80000000,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0xac8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| VPBROADCASTQ %RDX,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| OR %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| SHR $0x1,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| XOR 0x1728(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| AND $0x1,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| NEG %ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV $-0x66f74f21,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| AND %R8D,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| XOR %RDX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %RSI,0xac0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV $0xe8,%ECX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| VPBROADCASTQ 0x2b00(%RIP),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2aff(%RIP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2afe(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| VPBROADCASTQ 0x2afd(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV 0x1728(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| MOV $-0x80000000,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV 0x3b0(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| MOV %EDX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| OR %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| SHR $0x1,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
| XOR 0x1010(%RSP),%RSI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | scal (12.5%) |
| AND $0x1,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| NEG %EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| AND %EDI,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| XOR %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RDX,0x1728(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| JMP 32b0 <main+0xdd0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| VMOVSS %XMM0,(%RBX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| JMP 36e3 <main+0x1203> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0x58(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM0,(%RDX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| JMP 3ac3 <main+0x15e3> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| MOV 0x90(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| VMOVSS %XMM0,(%RDX,%RAX,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| CMP 0x50(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 | N/A |
| JNE 3313 <main+0xe33> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
