c - C 最適化手法

Question

アプリケーションのボトルネックのように見えるため、アセンブリに入ることなく、以下の関数から最速の最適化を探しています。次の関数は既にインラインで宣言されていることに注意してください。

定義: P = 10 および N = 240

void autocorrelation( int32_t *data , float *r){
    for ( int m=0 ; m < P+1 ; m++)
    {
        register float temp = 0;
        for ( int n=0 ; n<N-m ; n++)
        {
            temp += (float)(data[n])*(float)(data[n+m]);
        }
        r[m] = temp;
    }
}

どんな助けでも大歓迎です。

ありがとう！

編集：

組み立て：

temp += (float)(data[n])*(float)(data[n+m]);
800063A8  lddsp R8, SP[0x0]      
800063AA  add R1, R2, R8<<0      
800063AE  ld.w R12, R1[R7<<0]        
800063B2  mcall 0x80006f58       
800063B6  mov R4, R12        
800063B8  ld.w R12, R2[R7<<0]        
800063BC  mcall 0x80006f58       
800063C0  mov R11, R12       
800063C2  mov R12, R4        
800063C4  mcall 0x80006f5c       
800063C8  mov R11, R12       
800063CA  mov R12, R5        
800063CC  mcall 0x80006f60       
800063D0  mov R5, R12        
for ( int n=0 ; n<N-m ; n++)
800063D2  sub R6, -1         
800063D4  sub R7, -4         
800063D6  cp.w R6, R3        
800063D8  brne 0x800063ae        
r[m] = temp;
800063DA  ld.w R10, PC[2954]         
800063DE  lddsp R9, SP[0x0]      
800063E0  st.w R10[R9<<0], R5        
800063E4  sub R0, 1      
800063E6  sub R9, -4         
800063E8  stdsp SP[0x0], R9      
for ( int m=0 ; m < P+1 ; m++)
800063EA  cp.w R0, 229       
800063EE  breq 0x800063fc        
800063F0  mov R3, R0         
for ( int n=0 ; n<N-m ; n++)
800063F2  cp.w R0, 0         
800063F4  brgt 0x800063a2        
800063F8  mov R5, 0      
800063FA  rjmp 0x800063da

/////////////////////////////////////////////// /////////////////////////////

そこで、コードを次のように変更しました。

void autocorrelation( float *data , float *r){
    for ( int m=0 ; m < P+1 ; m++)
    {
        register float temp = 0;
        for ( int n=0 ; n<N-m ; n++)
        {
            temp += data[n]*data[n+m];
        }
        r[m] = temp;
    }
}

時間を 3 分の 1 に短縮 (各ティックは 1/16000Hz) - 元々 - 108 ティック - 現在は 70 ティック

新しいアセンブリ:

temp += data[n]*data[n+m];
800063C2  add R2, R3, R0<<0      
800063C6  ld.w R11, R3[R7<<0]        
800063CA  ld.w R12, R2[R7<<0]        
800063CE  mcall 0x80006f68       
800063D2  mov R11, R12       
800063D4  mov R12, R5        
800063D6  mcall 0x80006f6c       
800063DA  mov R5, R12        
for ( int n=0 ; n<N-m ; n++)
800063DC  sub R6, -1         
800063DE  sub R7, -4         
800063E0  cp.w R6, R4        
800063E2  brne 0x800063c6        
r[m] = temp;
800063E4  ld.w R9, PC[2960]      
800063E8  st.w R9[R0<<0], R5         
800063EC  sub R1, 1      
800063EE  sub R0, -4         
for ( int m=0 ; m < P+1 ; m++)
800063F0  cp.w R1, 229       
800063F4  breq 0x80006402        
800063F6  mov R4, R1         
for ( int n=0 ; n<N-m ; n++)
800063F8  cp.w R1, 0         
800063FA  brgt 0x800063bc        
800063FE  mov R5, 0      
80006400  rjmp 0x800063e4

/////////////////////////////////////////////// //// FINAL 最終解: (再度変更)

マークされたソリューションを組み合わせて、作成したアプリケーションでループを巻き戻し、最後まで 64 ビットにとどまり、パフォーマンスが最後から 60 ティックから 20 ティックに向上しました。同じ微調整で 6 つの関数を超えると、250 ティックで最適化されたように見えるコードの最初から 50 ティックまで引き出すことができました。ピンポンバッファーでは 160 ティック以内にすべてを完了する必要がありました。部屋：

void fastAutocorrelation( int64_t *data , float *r){

int64_t *temp;
int64_t *datan = data;
int64_t *datanm = data;

*temp = (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*r++ = (float)(*temp)/int64ToFloat;

datan = data;
datanm = data + 1;
*temp = (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
...

score 3 · Accepted Answer

プロセッサに浮動小数点機能がないことに注意してください。浮動小数点演算はソフトウェアでエミュレートされています。これは、ボトルネックがループ制御ではないことを意味します (コンパイラーは既に強度削減の良い仕事をしています)。ボトルネックは浮動小数点エミュレータです。

プロセッサにネイティブの浮動小数点がない場合、大規模な L1 キャッシュも搭載されていない可能性があります。ループコントロールの順序を変更すると、データの局所性が向上する場合があります。元のコードは 240 要素の配列全体で 10 回のスイープを行いますが、これは局所性が低くなります。一度に 10 個の項目を調べて、アレイ全体を 1 回掃引する方がよいでしょう。

void autocorrelation( int32_t *data , float *r){
  int m, n;
  for (m = 0; m < P + 1; m++) r[m] = 0.0f;
  for (n = 0; n < N; n++) {
    int limit = min(P + 1, N - n);
    for (m = 0; m < limit; m++) {
      r[m] += data[n] * data[n+m];
    }
  }
}

(元のコードは既にコンパイラーによってポインターを使用するように最適化されているため、ポインターへの変換は役に立たないことに注意してください。)

score 1 · Accepted Answer

冒険的な実験をしたい場合は、Intel の ICC コンパイラ (x86 アセンブラでしたか?) を試してみてください。ループの反復ごとに個別のスレッドを使用して for ループを自動的に並列化する適切なコマンドラインスイッチを使用します。ただし、スレッドのオーバーヘッドが価値あるものになるには、ループがかなり充実している必要があります。

もう 1 つのアプローチは、SSE と AVX に取り掛かることです。さらに良いことに、最新の Intel x86 には、相関や FFT などに必要な乗算/加算命令があると思います。そうすれば、コードをベクトル化し、クロックサイクルごとに複数の演算が行われます (通常の処理を超えて)。 CPU パイプラインで実現可能)。事実上、SSE/AVX オペコードに直接マップするいくつかの追加の「関数」があり、C コードで簡単に使用できます。コンパイラはこれらについて知る必要があります (Intel のものは確かにそうです)。そうしないと、独自のオンラインアセンブラを使用することになります。また、実行時に異なるバージョンの CPU を処理するという問題もあります。すべての PC に最新の Intel チップが搭載されているわけではありません。

または、私のように怠け者で、Intel の IPP/MKL のような事前に最適化されたルーチンのライブラリを使用することもできます。ICC と同様に費用がかかりますが、プロジェクトで速度 = 大金が必要な場合は、十分に価値があります。

score 0 · Accepted Answer

やってみました：

r1)配列にインデックスを付ける代わりにポインタを進めますか? ( @paddy がdata_nandに対して行ったようにdata_m)。たとえば*(r++) = temp、代わりにr[m] = temp

2) ループ展開。コンパイラは明らかにこれを行いません。これにより、たとえば、内部ループの速度が大幅に向上します。具体的には、http: //www.quickiwiki.com/en/Duff 's_device を参照して、適切なループ展開を確認してください。

3) ループのアンローリングを極限まで！(きれいではないかもしれませんが、それでもクールです): andの値を知っています: ループを完全にアンロールし、分岐なしですべてのコード (長いものではありますが) を書くことができます。アーキテクチャによっては (十分なプリフェッチパイプラインとダム分岐予測が組み合わされている)、これにより速度が大幅に向上する可能性があります。完全に展開されたループを生成する小さなユーティリティを作成し、生成されたコードをファイルに含めることもできます。または、マクロとメタプログラミングを使用してください。NP.c

c - C 最適化手法

4 に答える 4

Related

Reference