c++ - カスタムループが速い理由は？悪いコンパイラ？安全でないカスタムコード？運？（幸運なキャッシュヒット）

Question

アセンブリを学び始め、C-Free5.0のDigital-MarsコンパイラでC++のasm{}本体を使用して2つの変数を交換するためのカスタムループを作成しました。

-o（最適化）を有効にしました

そして結果を得ました：

 time of for-loop(cycles)        844
 time of while-loop(cycles)      735
 time of custom-loop-1(cycles)   562
 time of custom-loop-2(cycles)   469

比較するDigital-Marsコンパイラの「asmoutput」オプションが見つかりませんでした。ビルドオプションには他の最適化オプションはありません。コンパイラを変更する必要がありますか？はいの場合、どれですか？以下のコードを見て、カスタムループが高速である理由を教えてください。

これがforループの標準です。

// Baseline 1: swap a and b through temp 200,000,000 times in a counted
// for-loop, bracketing the whole loop with clock() calls.
// t2-t1 is the elapsed clock() tick count, i.e. units of 1/CLOCKS_PER_SEC.
t1=clock(); 
for(int i=0;i<200000000;i++)
{
    temp=a;//statement 1: save a
    a=b;//statement 2: overwrite a with b
    b=temp;//statement 3: b receives the saved a (three C statements total)
}   
t2=clock();
printf("\n time of for-loop(increasing) %i  \n",(t2-t1));

これが標準のwhileループです。

// Baseline 2: the same three-statement swap, driven by a while-loop that
// counts j up to 200,000,000.
// NOTE(review): j is not initialized in this fragment; it has to be 0 on
// entry for the loop to run the full count.
t1=clock();
while(j<200000000)
{
    temp=a;//same three-statement swap as the for-loop version
    a=b;
    b=temp; 
            j++;
}
t2=clock();
printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));

これが私のカスタムループ1です。

// Hand-written loop 1: swap a and b through memory on every iteration.
// ecx counts up from 0, edx holds the limit copied from j, and eax/ebx carry
// the two values between the loads and the stores.
t1=clock();
j=200000000;//iteration count, read into edx below
    __asm
    {
        pushf           //save flags
        push eax        //save every register the block modifies
        push ebx        
        push ecx        
        push edx        

        mov ecx,0       //ecx = loop counter, runs 0 .. j-1
        mov edx,j       //edx = loop limit

        do_it_again:    //top of the loop


        mov eax,a       //load both variables ...
        mov ebx,b       
        mov b,eax       //... and store them back crosswise
        mov a,ebx       //four instructions, two loads and two stores

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again  //unsigned "below": loop while counter < limit

        pop edx     //restore in reverse order of the pushes
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));

これが私の2番目のカスタムループです：

// Hand-written loop 2: swap a and b entirely in registers using add/sub.
// eax and ebx are loaded from a and b once before the loop and written back
// once after it, so the loop body itself touches no memory.
t1=clock();
j=200000000;//iteration count, read into edx below
    __asm
    {
        pushf           //save flags
        push eax        //save every register the block modifies
        push ebx        
        push ecx        
        push edx        

        mov ecx,0       //ecx = loop counter, runs 0 .. j-1
        mov edx,j       //edx = loop limit

        mov eax,a       //eax = a
        mov ebx,b       //ebx = b

        do_it_again2:   //top of the loop

        //swap using only the two registers
        sub eax,ebx         //eax = a-b
        add ebx,eax         //ebx = b+(a-b) = a
        sub eax,ebx         //eax = (a-b)-a = -b
        neg eax             //eax = b; two's-complement negate (flipping only
                            //the sign bit with xor 80000000h is not a negate)

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again2 //unsigned "below": loop while counter < limit

        mov a,eax       //write the result back before the pops below
        mov b,ebx       //discard the working registers

        pop edx         //restore in reverse order of the pushes
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

完全なコード：

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

// Times four ways of swapping two ints 200,000,000 times each: a C for-loop,
// a C while-loop, and two inline-assembly loops. Each section is bracketed by
// clock() calls and prints the elapsed tick count (1/CLOCKS_PER_SEC units).
int main()
{
    int j=0;

    int a=0,b=0,temp=0;

    srand(time(0));
    // clock() returns clock_t, not time_t
    clock_t t1=0;
    clock_t t2=0;


    // --- C for-loop ---
    t1=clock();
    for(int i=0;i<200000000;i++)
    {
        temp=a;//statement 1
        a=b;//statement 2
        b=temp;//statement 3
    }
    t2=clock();
    // cast so the argument really is the int that %i expects
    printf("\n time of for-loop(cycles) %i  \n",(int)(t2-t1));


    // --- C while-loop (j starts at 0 from its declaration) ---
    t1=clock();
    while(j<200000000)
    {
        temp=a;//same three-statement swap
        a=b;
        b=temp;
        j++;
    }
    t2=clock();
    printf("\n time of while-loop(cycles)  %i  \n",(int)(t2-t1));


    // --- inline asm 1: swap through memory on every iteration ---
    t1=clock();
    j=200000000;//iteration count, read into edx below
    __asm
    {
        pushf           //save flags
        push eax        //save every register the block modifies
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = loop counter, runs 0 .. j-1
        mov edx,j       //edx = loop limit

        do_it_again:    //top of the loop

        mov eax,a       //load both variables ...
        mov ebx,b
        mov b,eax       //... and store them back crosswise
        mov a,ebx

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again  //unsigned "below": loop while counter < limit

        pop edx         //restore in reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
        popf
    }

    t2=clock();
    printf("\n time of custom-loop-1(cycles)   %i   \n",(int)(t2-t1));


    // --- inline asm 2: swap in registers using add/sub ---
    t1=clock();
    j=200000000;//iteration count, read into edx below
    __asm
    {
        pushf           //save flags
        push eax
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = loop counter, runs 0 .. j-1
        mov edx,j       //edx = loop limit

        mov eax,a       //eax = a
        mov ebx,b       //ebx = b

        do_it_again2:   //top of the loop

        sub eax,ebx         //eax = a-b
        add ebx,eax         //ebx = b+(a-b) = a
        sub eax,ebx         //eax = (a-b)-a = -b
        neg eax             //eax = b; two's-complement negate (flipping only
                            //the sign bit with xor 80000000h is not a negate)

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again2 //unsigned "below": loop while counter < limit

        mov a,eax       //write the result back before the pops below
        mov b,ebx       //discard the working registers

        pop edx         //restore in reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
        popf
    }

    t2=clock();
    printf("\n time of custom-loop-2(cycles)  %i   \n",(int)(t2-t1));

    return 0;

}

私はちょうどc++とアセンブリを学んでいて、物事がどのように起こっているのか疑問に思いました。ありがとうございました

Windows XP、Pentium 4（2 GHz）Digital-Mars in C-Free

score 6 · Accepted Answer

そのコンパイラによって生成されたコードはかなりひどいものです。オブジェクトファイルを objconv で逆アセンブルしたところ、最初の for ループについて次の結果が得られました。

; Disassembly of the compiler-generated for-loop. Every value lives in a stack
; slot addressed off ebp, and eax is the only register used.
; Matching the moves against the C source (temp=a; a=b; b=temp), the slots
; appear to be: [ebp-4H] = i, [ebp-18H] = a, [ebp-14H] = b, [ebp-10H] = temp.
; loop test: compare the in-memory counter with the limit, leave when i >= 200000000
?_001:  cmp     dword [ebp-4H], 200000000               ; 0053 _ 81. 7D, FC, 0BEBC200
        jge     ?_002                                   ; 005A _ 7D, 17
; i++ is performed directly on the memory slot, before the body runs
        inc     dword [ebp-4H]                          ; 005C _ FF. 45, FC
; temp = a
        mov     eax, dword [ebp-18H]                    ; 005F _ 8B. 45, E8
        mov     dword [ebp-10H], eax                    ; 0062 _ 89. 45, F0
; a = b
        mov     eax, dword [ebp-14H]                    ; 0065 _ 8B. 45, EC
        mov     dword [ebp-18H], eax                    ; 0068 _ 89. 45, E8
; b = temp (reloads the value that was stored two instructions earlier)
        mov     eax, dword [ebp-10H]                    ; 006B _ 8B. 45, F0
        mov     dword [ebp-14H], eax                    ; 006E _ 89. 45, EC
; unconditional jump back to the loop test
        jmp     ?_001                                   ; 0071 _ EB, E0

問題は、アセンブリを見たことがある人には明らかなはずです。

ループは eax に入れられる値に大きく依存しています。これにより、後続のすべての命令がそのレジスタに依存することになり、アウトオブオーダー実行が事実上不可能になります。
6 つの汎用レジスタが利用可能ですが (ほとんどのセットアップでは ebp と esp は実際には汎用ではないため)、コンパイラはそれらのいずれも使用せず、ローカルスタックの使用に戻ります。速度が最適化の目標である場合、これは絶対に受け入れられません。現在のループインデックスが [ebp-4H] に格納されていることもわかりますが、これはレジスタに簡単に格納できたはずです。
この cmp 命令は、メモリオペランドと即値オペランドを使用します。これは可能な限り低速なオペランドの組み合わせであり、パフォーマンスが問題になる場合は使用すべきではありません。
そして、コードサイズについては言うまでもありません。これらの命令の半分は不要です。

全体として、私が最初にすることは、できるだけ早い機会にそのコンパイラを捨てることです。しかし、繰り返しになりますが、オプションの1つとして「メモリモデル」が提供されていることを考えると、あまり期待できないようです.

score 5 · Accepted Answer

コンパイラが作成するアセンブリ言語の結果を見ずに、コンパイラが何をしているのかを推測するのは少し難しいです。VC ++ 10を使用すると、次の結果が得られます。

time of for-loop(cycles) 155

time of while-loop(cycles)  158

time of custom-loop-1(cycles)   369

time of custom-loop-2(cycles)  314

出力はまだ見ていませんが、すぐに推測できるのは、for ループと while ループの違いは単なるノイズだということです。ただし、どちらも手書きのアセンブリコードよりも明らかにかなり高速です。

編集：アセンブリコードを見ると、私は正しかったようです。for と while のコードは同じで、次のようになります。

        ; Compiler-generated listing for the timed C loop: a and b are loaded
        ; into registers once, swapped register-to-register inside the loop,
        ; and written back to memory once after it.
        call    _clock
        mov     ecx, DWORD PTR _a$[ebp]         ; ecx = a
        cdq                                     ; sign-extend eax (clock result) into edx
        mov     ebx, edx                        ; keep the high half of the start time
        mov     edx, DWORD PTR _b$[ebp]         ; edx = b
        mov     edi, eax                        ; keep the low half of the start time
        mov     esi, 200000000                  ; esi = down-counter
$LL2@main:
; Line 28
        dec     esi                             ; sets ZF when the counter reaches 0
; Line 30
        mov     eax, ecx                        ; temp = a
; Line 31
        mov     ecx, edx                        ; a = b
; Line 32
        mov     edx, eax                        ; b = temp
        jne     SHORT $LL2@main                 ; mov leaves flags alone, so this tests the dec above
        mov     DWORD PTR _b$[ebp], edx         ; write b back after the loop
        mov     DWORD PTR _a$[ebp], ecx         ; write a back after the loop
; Line 35
        call    _clock

間違いなく2番目のループよりも「賢い」ものではありませんが、最近のCPUは単純なコードで最適に動作する傾向があります。また、ループ内の命令が少ないだけです（ループ内のメモリをまったく参照しません）。これらは決して効率の唯一の尺度ではありませんが、この単純なループでは、かなりの指標になります。

編集2：

楽しみのために、トリプルXORスワップを追加した新しいバージョンと、CPUのxchg命令を使用したバージョンを作成しました（速度などをあまり気にせずに手書きするなら、おそらくそう書くからです）。Intel / AMDは通常、より複雑な命令を推奨していませんが、問題を引き起こしているようには見えません。少なくとも他のものと同じくらい速いようです。

 time of for-loop(cycles) 156

 time of while-loop(cycles)  160

 time swap between register and cache  284

 time to swap using add/sub:  308

 time to swap using xchg:  155

 time to swap using triple-xor  233

ソース：

// Note: updated source -- it was just too ugly to live. Same results though.
#include<stdlib.h>
#include<time.h>
#include <iostream>
#include <string>
#include <iomanip>
#include <sstream>

// File-local state shared by every benchmark functor below.
namespace { 
    int a, b;                       // the two values that get swapped
    const int loops = 200000000;    // iteration count used by each benchmark
}

// Benchmark harness: constructing a timer<swapper> default-constructs a
// swapper, invokes it once between two clock() readings, and prints the
// elapsed tick count after a left-aligned, 30-column-wide heading.
template <class swapper>
struct timer {
    timer(std::string const &label) {
        clock_t const started = clock();
        swapper()();
        clock_t const finished = clock();

        // Assemble the heading first so setw() pads the whole phrase.
        std::string const heading = "Time for swap using " + label;

        std::cout << std::left << std::setw(30) << heading;
        std::cout << " = " << (finished - started) << "\n";
    }
};

// Benchmark functor: swaps the file-local a and b through a temporary,
// `loops` times, using a counted for-loop.
struct for_loop {
    void operator()() {
        int temp;
        for(int i=0;i<loops;i++) {
            temp=a;//statement 1: save a
            a=b;//statement 2: overwrite a with b
            b=temp;//statement 3: b receives the saved a
        }
    }
};

// Benchmark functor: the same three-statement swap as for_loop, driven by a
// while-loop with an explicit counter j.
struct while_loop {
    void operator()() { 
        int j = 0;
        int temp;
        while(j<loops) {
            temp=a;//same swap through a temporary
            a=b;
            b=temp; 
            j++;
        }
    }
};

// Benchmark functor: inline-assembly swap that loads a and b from memory and
// stores them back crosswise on every iteration.
// Registers: ecx = up-counter, edx = limit, eax/ebx = the two values.
struct reg_mem {
    void operator()() {
        int j=loops;//local copy of the iteration count, read by the asm block
        __asm {
            mov ecx,0       //counter starts at 0
            mov edx,j       //edx = loop limit
    do_it_again:    //top of the loop
            mov eax,a       //load both variables ...
            mov ebx,b       
            mov b,eax       //... and store them back crosswise
            mov a,ebx       //two loads and two stores per iteration

            inc ecx         //counter++
            cmp ecx,edx     //counter < limit ?
            jb do_it_again  //unsigned "below": loop while counter < limit
        }
    }
};

// Benchmark functor: inline-assembly swap done entirely in registers with
// add/sub. a and b are loaded once before the loop and written back once
// after it.
// Registers: ecx = up-counter, edx = limit, eax = a, ebx = b.
struct add_sub {
    void operator()() { 
        int j=loops;//local copy of the iteration count, read by the asm block
        __asm {
            mov ecx,0       //counter starts at 0
            mov edx,j       //edx = loop limit

            mov eax,a       //eax = a
            mov ebx,b       //ebx = b

    do_it_again2:   //top of the loop

            //swap using only the two registers
            sub eax,ebx         //eax = a-b
            add ebx,eax         //ebx = b+(a-b) = a
            sub eax,ebx         //eax = (a-b)-a = -b
            neg eax             //eax = b; two's-complement negate (flipping
                                //only the sign bit with xor 80000000h is not
                                //a negate and corrupted the value stored to a)

            inc ecx         //counter++
            cmp ecx,edx     //counter < limit ?
            jb do_it_again2 //unsigned "below": loop while counter < limit

            mov a, eax      //write the results back once, after the loop
            mov b, ebx
        }
    }
};

// Benchmark functor: swaps two registers with a single xchg per iteration.
// Registers: ecx = down-counter loaded from loops, eax = a, ebx = b.
struct xchg {
    void operator()() {
        __asm {
            mov ecx, loops      // ecx = iteration count
            mov eax, a
            mov ebx, b
    do_it_again3:
            dec ecx             // sets ZF when the counter reaches 0
            xchg eax, ebx       // the swap; xchg does not modify flags
            jne do_it_again3    // so this still tests the dec above
            mov a, eax          // write the results back once, after the loop
            mov b, ebx
        }
    }
};

// Benchmark functor: swaps two registers with the three-XOR sequence.
// Registers: ecx = down-counter loaded from loops, eax = a, edx = b.
struct xor3 {
    void operator()() { 
        _asm { 
            mov ecx, loops      // ecx = iteration count
            mov eax, a
            mov edx, b
    do_swap4:
            xor eax, edx        // eax = a^b
            xor edx, eax        // edx = b^(a^b) = a
            xor eax, edx        // eax = (a^b)^a = b
            dec ecx             // count down; sets ZF at 0
            jnz do_swap4

            mov a, eax          // write the results back once, after the loop
            mov b, edx
        }
    }
};

// Runs each benchmark once, in this order. Every statement constructs a
// temporary timer<...>, whose constructor runs and reports the measurement.
int main() {
    timer<for_loop>("for loop");
    timer<while_loop>("while loop");
    timer<reg_mem>("reg<->mem");
    timer<add_sub>("add/sub");
    timer<xchg>("xchg");
    timer<xor3>("triple xor");
    return 0;
}

結論：少なくともこの程度の些細な作業では、まともなコンパイラに対して気にするほどの差をつけて勝つことはできません（おそらく、コードがわずかに小さくなる可能性を除けば、まったく勝てないでしょう）。

score 3 · Accepted Answer

これはおそらく、コンパイラがオペランドをレジスタに割り当てられず、代わりに間接 (アドレス) オペランドを操作しているためです。

コンパイラを切り替える<-- これが最善の最適化です。

更新：同じプログラムを、Intel 構文の gcc インラインアセンブリに翻訳する手間をかけてみました (test.c)。その結果は、for ループと while ループが手書きのアセンブリよりもはるかに優れていることを明確に示しています。

とはいえ、Digital Mars を使用すると、次の処理が高速になります。

// Hand-written loop 3: XOR swap in registers with a down-counting loop.
// Registers: ecx = down-counter loaded from j, eax = a, ebx = b.
// Both variables are stored back to memory on every iteration.
// NOTE(review): j must be greater than 0; with j == 0 the dec wraps and the
// loop runs 2^32 times.
__asm
{
    mov ecx,j     //ecx = iteration count; a plain load is required here,
                  //"xor ecx,j" mixed j into whatever ecx already held

    mov eax,a     //eax = a
    mov ebx,b     //ebx = b

do_it_again3: //top of the loop

    //swap with the three-XOR sequence
    xor eax,ebx         //eax = a^b
    xor ebx,eax         //ebx = b^(a^b) = a
    xor eax,ebx         //eax = (a^b)^a = b

    mov a,eax           //store both values back every iteration
    mov b,ebx

    dec ecx           //count down; sets ZF at 0, so no separate cmp is needed
    jnz do_it_again3  //loop until the counter reaches 0
}

使用して

XORスワップイディオム
下降ループ
暗黙の比較フラグ (dec ecx による)

Digital Mars Compiler バージョン 8.42n を使用したベンチマークの結果は次のとおりです。

time of for-loop(cycles) 572  
time of while-loop(cycles)  566  
time of custom-loop-1(cycles)   355   
time of custom-loop-2(cycles)  317   
time of custom-loop-3(cycles)  234

完全なリスト：

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

int main()
{
    int j=0;

    int a=0,b=0,temp=0;

    srand(time(0));
    time_t t1=0;
    time_t t2=0;


    t1=clock();
    for(int i=0; i<200000000; i++)
    {
        temp=a;//instruction 1
        a=b;//instruction 2
        b=temp;//3 instructions total
    }
    t2=clock();
    printf("\n time of for-loop(cycles) %i  \n",(t2-t1));


    t1=clock();
    while(j<200000000)
    {
        temp=a;//again it is three instructions
        a=b;
        b=temp;
        j++;
    }
    t2=clock();
    printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));


    t1=clock();
    j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        //backup
        push ebx        //backup
        push ecx        //backup
        push edx        //backup

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        do_it_again:    //begin to loop


        mov eax,a       //basic swap steps between cpu and mem(cache)
        mov ebx,b
        mov b,eax
        mov a,ebx       //four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again  // end of loop block

        pop edx     //rolling back to history
        pop ecx
        pop ebx
        pop eax
        popf
    }

    t2=clock();
    printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));

    t1=clock();
    j=200000000;//setting the count
    __asm
    {
        pushf           //backup
            push eax        
            push ebx        
            push ecx        
            push edx        

            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j

            mov eax,a       //getting variables to registers
            mov ebx,b

            do_it_again2:   //begin to loop

            //swapping with using only 2 variables(only in cpu)
            sub eax,ebx         //a is now a-b
            add ebx,eax         //b is now a
            sub eax,ebx         //a is now -b
            xor eax,80000000h   //a is now b and four instructions total

            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again2  // end of loop block

            pop edx         //rollback
            pop ecx         
            pop ebx         
            pop eax         
            popf            
    }

    t2=clock();
    printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

    t1=clock();
    j=200000000;//setting the count
    __asm
    {
        xor ecx,j     //init of loop range(200000000 to 0)

        mov eax,a     //getting variables to registers
        mov ebx,b

    do_it_again3:   //begin to loop

        //swapping with using only 2 variables(only in cpu)
        xor eax,ebx
        xor ebx,eax         
        xor eax,ebx         

        mov a,eax
        mov b,ebx

        dec ecx         // j--
        jnz do_it_again3  // end of loop block
    }

    t2=clock();
    printf("\n time of custom-loop-3(cycles)  %i   \n",(t2-t1));

    return 0;

}

score 2 · Accepted Answer

皆さんがCコードからゼロサイクル以外のものを取得したことに驚いています。ここで、gcc 4.6.3および-O2、では、ループからの副作用がないため、ループは消えます。asmブロック以外はすべて削除されます。Digital Marsがそのような些細な最適化を実行できないとしたら、私は驚きます。Cコードを削除するさまざまな最適化スイッチを試すことができると思います。その時点で、そのような些細な比較は不可能になります。

あなたのおもちゃの例は、コンパイラの最適化を手作りのアセンブリと比較するのに役に立ちません。統計的に言えば、コンパイラーは一貫して人間よりも優れたマシンコードを書くことができます。

score 0 · Accepted Answer

これは正常なことであり、コンパイラを変更してもこの「問題」は解決されません。アセンブラーは非常に低レベルであり、すべてを制御できます。C++ コンパイラは、常に必要以上のことを行います。関数の呼び出しは、コンパイラがスタックを保護するため (たとえば)、アセンブリよりも時間がかかります。ループでも同じです。新しい変数を宣言すると時間がかかり、値も追加されます...

この質問は、より多くの情報を得るために興味深いはずです: アセンブラーが C よりも速いのはいつですか?

c++ - カスタムループが速い理由は？悪いコンパイラ？安全でないカスタムコード？運？（幸運なキャッシュヒット）

5 に答える 5

Related

Reference