c++ - C++インラインアセンブリでこれに対するより良い解決策を提案できますか?

Question

私はアセンブリを学んでおり、Digital-Mars C++ コンパイラ (Intel 構文の方が読みやすいため) 内で SSE および MMX レジスタの実験を開始しました。var_1 を値として取り、それを var_2 進数に変換するプログラムを完成させました (これは今のところ 8 ビットです。後で 32、64、128 ビットに拡張します)。プログラムは、次の 2 つの方法でこれを行います。

__asmインライン化
%(modulo) 演算子の通常の C++ の方法。

質問: xmm0-7 および mm0-7 レジスターを使用するより効率的な方法と、それらの正確なバイトを al,ah... 8 ビットレジスターと交換する方法を教えてください。

C++ の通常の %(modulo) 演算子は、私のコンピューター (pentium-m centrino 2.0GHz) の __asm と比較して非常に遅いです。__asm で除算命令をなくす方法を教えていただければ、さらに速くなるはずです。

プログラムを実行すると、次のようになります。

(for the values: var_1=17,var_2=2,all loops are 200M times)

17 is 10001 in number system 2
__asm(clock)...........: 7250    <------too bad. it is 8-bit calc.
C++(clock).............: 12250   <------not very slow(var_2 is a power of 2)


(for the values: var_1=33,var_2=7,all loops are 200M times)
33 is 45 in number system 7
 __asm(clock)..........: 2875   <-------not good. it is 8-bit calc.
 C++(clock)............: 6328   <----------------really slow(var_2 is not a power of 2)

2 番目の C++ コード (% 演算子を含むもの): /////////////////////////////////// ///////////////////

t1=clock();//reference time
// Benchmark: convert x to base g 200,000,000 times using / and %.
// Digits are written to var_3[] least-significant first; after each pass
// var_3[0..counter] holds the result, i.e. counter+1 digits.
// Assumes x >= 0 and g >= 2 (g == 1 would never terminate, g == 0 divides by zero).
for(int i=0;i<200000000;i++)
{
    y=x;
    counter=0;
    // ">=" (not ">") is required: when the running quotient equals the base
    // (e.g. x=4, g=2) one more digit must still be split off, otherwise the
    // most significant digit is lost.
    while(y>=g)
    {
        var_3[counter]=y%g; // current least-significant digit
        y/=g;               // drop that digit
        counter++;
    }

    // y < g here, so this stores the most significant digit.
    var_3[counter]=y%g;
}
t2=clock();//final time

_asm コード://///////////////////////////////////////// /////////////////////////////////////////////// ///////////

     __asm  // i love assembly in some parts of C++
        {
        // 8-bit base conversion benchmark.
        // Repeats 200,000,000 times: write the digits of (var_1 & 0FFh) in base
        // (var_2 & 0FFh) into var_3[] as 32-bit ints, least-significant first,
        // and finally store the digit count in var_3_size.
        // The low byte of var_2 must not be 0 (div would fault) and should be
        // >= 2 (with 1 the quotient never reaches zero).
        //
        // Register roles:
        //   xmm0 = outer loop counter      xmm1 = var_1      xmm2 = var_2
        //   ebx  = address of var_3 (constant)
        //   cl   = base, al = value / quotient, ah = remainder
        //   edx  = digit index, edi = zero-extended digit

        pushf   //here does register backup
        push eax
        push ebx
        push ecx
        push edx
        push edi

            xor eax,eax
            movd xmm0,eax     //xmm0 = outer loop counter = 0 (movd also clears the upper 96 bits)
            movd xmm1,[var_1] //xmm1 = number to convert
            movd xmm2,[var_2] //xmm2 = number system (base)
            lea ebx,var_3     //ebx = &var_3[0]; never modified below
            for_loop:
            movd ecx,xmm2     //cl = base (low byte of var_2)
            movd eax,xmm1     //al = value (low byte of var_1)
            xor edx,edx       //digit index = 0
            dng:
                mov ah,00h      //AX = AL: dividend for the 8-bit division
                div cl          //al = quotient, ah = remainder

                movzx edi,ah            //widen the digit so the whole int slot is written
                mov [ebx+edx*4],edi     //var_3[index] = remainder (the only memory write)

                inc edx         //index++
                test al,al      //is the quotient zero?
            jne dng             //if no, extract the next digit

            //here edx holds the number of digits

            movd eax,xmm0       //get the outer loop counter
            add eax,01h         //j++
            movd xmm0,eax       //store the new counter
            cmp eax,0BEBC200h   //0BEBC200h = 200,000,000 iterations
            jb for_loop         //if j < 200,000,000 go again
            mov [var_3_size],edx //now we have number of digits too!
         //here does registers revert back to old values
        pop edi
        pop edx
        pop ecx
        pop ebx
        pop eax
        popf

        }

コード全体://///////////////////////////////////////// ////////////////////////////////////////

#include <iostream.h>
#include <cmath>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
// Benchmarks two ways of converting var_1 to base var_2, 200 million times each:
// an 8-bit inline-assembly loop and a plain C++ loop using / and %.
// Prints the clock() ticks used by each variant and the resulting digits.
int main()
    {

    srand(time(0));


    clock_t t1=clock();
    clock_t t2=clock();

    int var_1=17;  //number itself (the asm path only uses its low 8 bits)
    int var_2=2;   //number system (base); must be 2..255 for the 8-bit asm path
    int var_3[100];  //digits, least significant first (maximum 100)
    int var_3_size=0;//asm block stores the number of digits here

    for(int i=0;i<100;i++)
    {
    var_3[i]=0; //here we initialize digits to zeroes
    }


    t1=clock();//reference time to take
     __asm  // i love assembly in some parts of C++
        {
        // Register roles:
        //   xmm0 = outer loop counter      xmm1 = var_1      xmm2 = var_2
        //   ebx  = address of var_3 (constant)
        //   cl   = base, al = value / quotient, ah = remainder
        //   edx  = digit index, edi = zero-extended digit

        pushf   //here does register backup
        push eax
        push ebx
        push ecx
        push edx
        push edi

            xor eax,eax
            movd xmm0,eax     //xmm0 = outer loop counter = 0 (movd also clears the upper 96 bits)
            movd xmm1,[var_1] //xmm1 = number to convert
            movd xmm2,[var_2] //xmm2 = number system (base)
            lea ebx,var_3     //ebx = &var_3[0]; never modified below
            for_loop:
            movd ecx,xmm2     //cl = base (low byte of var_2)
            movd eax,xmm1     //al = value (low byte of var_1)
            xor edx,edx       //digit index = 0
            dng:
                mov ah,00h      //AX = AL: dividend for the 8-bit division
                div cl          //al = quotient, ah = remainder

                movzx edi,ah            //widen the digit so the whole int slot is written
                mov [ebx+edx*4],edi     //var_3[index] = remainder (the only memory write)

                inc edx         //index++
                test al,al      //is the quotient zero?
            jne dng             //if no, extract the next digit

            //here edx holds the number of digits

            movd eax,xmm0       //get the outer loop counter
            add eax,01h         //j++
            movd xmm0,eax       //store the new counter
            cmp eax,0BEBC200h   //0BEBC200h = 200,000,000 iterations
            jb for_loop         //if j < 200,000,000 go again
            mov [var_3_size],edx //now we have number of digits too!
         //here does registers revert back to old values
        pop edi
        pop edx
        pop ecx
        pop ebx
        pop eax
        popf

        }
    t2=clock(); //finish time
    // clock_t is not guaranteed to be int; cast so the argument matches %i.
    printf("\n assembly_inline(clocks): %i  for the 200 million calculations",(int)(t2-t1));

        printf("\n value %i(in decimal) is: ",var_1);
for(int i=var_3_size-1;i>=0;i--)
{
    printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",var_2);




//and: more readable form(end easier)
    int counter=0;  // index of the most significant digit after each pass
    int x=var_1;
    int g=var_2;
    int y=x;// backup
t1=clock();//reference time

for(int i=0;i<200000000;i++)
{
    y=x;
    counter=0;
    // ">=" (not ">"): when the running quotient equals the base one more
    // digit must still be split off, otherwise the top digit is lost.
    while(y>=g)
    {

        var_3[counter]=y%g; // current least-significant digit
        y/=g;               // drop that digit
        counter++;
    }

     var_3[counter]=y%g; // y < g here: most significant digit
}

t2=clock();//final time
printf("\n C++(clocks): %i  for the 200 million calculations",(int)(t2-t1));

printf("\n value %i(in decimal) is: ",x);
// Print using the digit count computed by the C++ loop itself
// (digits occupy var_3[0..counter]), not the count left by the asm block.
for(int i=counter;i>=0;i--)
{
    printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",g);
return 0;

}

編集：これは32ビット版です

    // Writes the digits of variable_x in base number_system into digits[]
    // (32-bit elements, least-significant digit first) and stores the digit
    // count in digits_total. The value is treated as unsigned (div).
    // If number_system < 2 no digits are produced and digits_total is set to 0,
    // because base 0 would fault in div and base 1 would never terminate.
    // NOTE(review): variable_x, number_system, digits and digits_total are
    // declared outside this block; assumed to be 32-bit globals - confirm.
    void get_digits_asm()
{
    __asm
    {

        pushf       //couldnt store this in other registers
        movd xmm0,eax//storing in xmm registers instead of pushing
        movd xmm1,ebx//
        movd xmm2,ecx//
        movd xmm3,edx//
        movd xmm4,edi//end of push backups

        mov eax,[variable_x]    //eax = value, then running quotient
        mov ebx,[number_system] //ebx = base
        xor ecx,ecx             //ecx = digit count
        cmp ebx,2
        jb store_total          //invalid base: report zero digits
        lea edi,digits          //edi = &digits[0] (loop invariant)

        begin_loop:
        xor edx,edx             //edx:eax = eax for the 32-bit division
        div ebx                 //eax = quotient, edx = remainder (next digit)
        mov [edi+ecx*4],edx     //digits[count] = remainder
        inc ecx
        test eax,eax            //stop only when nothing is left to divide,
        jnz begin_loop          //so the leading digit is never dropped

        store_total:
        mov [digits_total],ecx


        movd edi,xmm4//pop edi
        movd edx,xmm3//pop edx
        movd ecx,xmm2//pop ecx
        movd ebx,xmm1//pop ebx
        movd eax,xmm0//pop eax
        popf
    }

}

score 1 · Accepted Answer

もちろん、コードははるかに単純になります: (C++ バージョンをモデルにしており、プッシュとポップは含まれておらず、テストもされていません)

  ; esi = remaining benchmark passes (200 million)
  mov esi,200000000
_bigloop:
  ; reload the inputs on every pass, like "y=x; counter=0;" in the C++ loop
  lea edi,var_3          ; edi = var_3 (digit array, 4 bytes per entry)
  mov ebx,[g]            ; ebx = g (base)
  mov eax,[y]            ; eax = y (value, then running quotient)
  xor ecx,ecx            ; ecx = counter (digit index)
_loop:
  xor edx,edx            ; clear the high half of the edx:eax dividend
  div ebx                ; eax = quotient, edx = remainder
  mov [edi+ecx*4],edx    ; var_3[counter] = remainder
  inc ecx                ; counter++
  test eax,eax           ; more digits while the quotient is non-zero
  jnz _loop
  dec esi                ; one pass finished
  jnz _bigloop

しかし、C++ バージョンよりも高速であった場合は驚くでしょう。実際、基数が 2 の累乗である場合は、ほぼ確実に遅くなります。健全なコンパイラはすべて、2 のべき乗による除算や剰余を、ビットシフトとビットごとの AND に変換する方法を知っています。

これは 8 ビット除算を使用するバージョンです。同様の警告が適用されますが、除算がオーバーフローする可能性があります (y / g が 255 を超える場合)。

  ; 8-bit division variant: "div bl" divides AX by bl, leaving the quotient
  ; in al and the remainder in ah. The first division uses the low 16 bits
  ; of y; if that quotient does not fit in 8 bits the CPU raises a divide error.
  mov esi,200000000      ; esi = remaining benchmark passes
_bigloop:
  mov eax,[y]
  mov ebx,[g]
  lea edi,var_3
  ; eax = y
  ; ebx = g (bl is the divisor)
  ; edi = var_3
  xor ecx,ecx
  ; ecx = counter
_loop:
  div bl
  ; var_3 holds 4-byte ints, so widen the digit and index by counter*4
  ; (a byte store at [edi+ecx] would pack digits into the wrong slots).
  movzx edx,ah
  mov [edi+ecx*4],edx
  add ecx,1
  and eax,0xFF           ; keep only the quotient; sets ZF when it is zero
  jnz _loop
  sub esi,1
  jnz _bigloop

c++ - C++インラインアセンブリでこれに対するより良い解決策を提案できますか?

1 に答える 1

Related

Reference