c - ループ内のOpenMP同期

Question

データを生成する大きなループがあります。各反復には、たとえば1秒かかり、データのチャンクが生成されます。すべてのチャンクを正しい順序でファイルに書き込む必要があります。

ループを並列化するだけの場合は、次のように記述できます（非常に単純化されています）。

    // Naive parallel version: the outer loop has one iteration per worker
    // (4 in total) and iteration `thread` handles n = thread, thread+4, ...
    FILE* f = fopen("output.txt", "w");
    omp_lock_t lock;
    omp_init_lock(&lock);
    int nIterations = 1000000;
#pragma omp parallel for
    for(int thread=0; thread<4; thread++)
    {
        int a=0, b=0, c=0;
        for(int n=thread; n<nIterations; n+=4)
        {
            int value = do_computations(&a, &b, &c);
            // The lock makes each fprintf call exclusive, but nothing here
            // constrains which n gets written first.
            omp_set_lock(&lock);
            fprintf(f, "%d\n", value);
            omp_unset_lock(&lock);
        }
    }
#pragma omp barrier
    fclose(f);
    omp_destroy_lock(&lock);

これにより、出力がファイルに取り込まれますが、エントリの順序は保証されません。

すべてのスレッドがタスクを実行し、マスタースレッドがファイルに書き込み、スレッドが再開するように、実行を同期させたいと思います。言い換えれば、私はこのようなものが欲しいです：

    // Intended scheme: every worker computes one value, all wait at a barrier,
    // worker 0 writes the four values, then all wait again before continuing.
    #pragma omp parallel for
        for(int thread=0; thread<4; thread++)
        {
            int a=0, b=0, c=0;
            // NOTE(review): values is declared inside the outer loop body, so each
            // outer iteration has its own copy, and it is indexed by n below, which
            // grows past 3 -- presumably meant as a shared array indexed by thread.
            int values[4];
            for(int n=thread; n<nIterations; n+=4)
            {
                values[n] = do_computations(&a, &b, &c);
                // Wait until every worker has stored its value.
#pragma omp barrier
                if(thread == 0)
                {
                      for(int i=0; i<4; i++)
                        fprintf(f, "%d\n", values[i]);
                }
                // Wait until the values have been written before computing more.
#pragma omp barrier
            }
        }
#pragma omp barrier

ただし、説明のつかない理由により、これはOpenMP仕様で禁止されています。

あるいは、次のように試すこともできます。

    // Attempt using an ordered region so that the fprintf calls happen in order.
    // NOTE(review): the loop directive below carries no `ordered` clause.
    #pragma omp parallel for
        for(int thread=0; thread<4; thread++)
        {
            int a=0, b=0, c=0;
            for(int n=thread; n<nIterations; n+=4)
            {
                int value = do_computations(&a, &b, &c);
                // Reached once per inner-loop pass, i.e. many times within a
                // single iteration of the outer (work-shared) loop.
#pragma omp ordered
                {
                    fprintf(f, "%d\n", value);
                }
            }
        }
    #pragma omp barrier
        fclose(f);

ただし、「for 構文を使用したループの 1 回の反復は……複数の ordered ディレクティブを実行してはならない」ため、これも機能しません。

コードを単一のループとして書き直したくないし、ループを交換したくない。

他のスレッド/同期ツールを使用せずに、OpenMPでこれを行うためのクリーンな方法はありますか？

score 3 · Accepted Answer

計算と IO という 2 つのことを実行しようとしています。計算は並列化できますが、IO は必然的にシリアル化する必要があります。しかし、IO を計算と同じループに入れることで、計算にもシリアライゼーションを強制することになり、意味がありません。

すべての計算を行ってから IO を行う方がはるかに良いでしょう。この方法は、IO を逐次実行してもほぼ確実に高速になります。特に、ループで fprintf を繰り返し呼び出すのではなく、データを 1 つの大きなチャンクとしてバイナリで書き出せる場合はなおさらです。

    // Compute every value in parallel first, then write them out serially so
    // the file order matches the iteration order.
    FILE* f = fopen("output.txt", "w");
    const int nIterations = 1000000;
    // NOTE(review): this is an automatic array of nIterations ints; consider
    // heap allocation if the stack cannot hold it.
    int values[nIterations];

    // Each iteration writes only its own slot values[n], so no locking is needed.
#pragma omp parallel for 
    for(int n=0; n<nIterations; n++)
    {
        int a=0, b=0, c=0;
        values[n] = do_computations(&a, &b, &c);
    }

    // Serial output pass, in index order.
    for (int n=0; n<nIterations; n++)
    {
        fprintf(f,"%d\n", values[n]);
    }

    fclose(f);

もちろん、これにはより多くのメモリが必要ですが、速度とメモリは一般的なトレードオフです。このトレードオフの両極端のどちらもうまくいかない場合は、サイズを調整可能なチャンク単位で計算を行うこともできます。

    const int nIterations = 1000000;
    const int chunkSize   = 10000;
    int values[chunkSize];
    int chunkNum = 0;
    int chunkLeft = chunkSize;

    for (int start = 0; start < nIterations; start+= chunkSize) {

        if (start+chunkSize > nIterations) chunkLeft = nIterations - start;

    #pragma omp parallel for 
        for(int n=start; n<start+chunkLeft; n++)
        {
            int a=0, b=0, c=0;
            values[n-start] = do_computations(&a, &b, &c);
        }

        for (int n=0; n<chunkLeft; n++)
             fprintf(f,"%d\n", values[n]);

    }
    fclose(f);

score 0 · Accepted Answer

これまでの回答にはまだ挙がっていない解決策を提案してみます。

#include <stdio.h>
#include <assert.h>
#include <unistd.h>

#define NITER 100

/*
 * Writes NITER fixed-size integer records to output.bin from a parallel loop,
 * placing record ii at byte offset sizeof(ii)*ii so the file ends up in
 * iteration order regardless of which thread finishes first, then re-reads
 * the file serially to verify the order.
 *
 * Returns 0 on success, 1 if the file cannot be opened, written, read back or
 * closed.
 */
int main(void) {

  // "wb+": the records are raw ints, so the stream must be binary to keep
  // text-mode newline translation (e.g. on Windows) away from the data.
  FILE * f = fopen("output.bin", "wb+");
  if (f == NULL) {
    perror("fopen");
    return 1;
  }

  // Shared error flag; only ever set inside the critical section below,
  // because a parallel region cannot be left with return.
  int write_failed = 0;

#pragma omp parallel
  {
#pragma omp for schedule(runtime)
    for (int ii = 0; ii < NITER; ++ii) {
      sleep(1); // Simulate computation
      printf("%d\n",ii); // Just to be convinced that the loop is not evaluated in serial order
      // Seek + write must happen as one unit: the stream position is shared.
#pragma omp critical (FILEWRITE)
      {
        if (fseek(f, (long)(sizeof(ii)*ii), SEEK_SET) != 0 ||
            fwrite(&ii, sizeof(ii), 1, f) != 1) {
          write_failed = 1;
        }
      }
    }
  }

  if (write_failed) {
    fprintf(stderr, "failed to write output.bin\n");
    fclose(f);
    return 1;
  }

  // Check serially that the file is written in the right order
  if (fseek(f,0,SEEK_SET) != 0) {
    perror("fseek");
    fclose(f);
    return 1;
  }
  int value = -1;
  for (int ii = 0; ii < NITER; ++ii) {
    // A short read would otherwise leave a stale value to be compared.
    if (fread(&value,sizeof(value),1,f) != 1) {
      fprintf(stderr, "failed to read record %d from output.bin\n", ii);
      fclose(f);
      return 1;
    }
    assert( value == ii );
  }

  if (fclose(f) != 0) {
    perror("fclose");
    return 1;
  }
  return 0;
}

この方法は、各チャンクのサイズが明確に決まっており、計算中の反復番号からファイル先頭からのオフセットを導出できる場合にのみ適用できます。

とはいえ、質問で提示されているコードスニペットには多くの誤りがあり、OpenMP の基本を確認し直す必要があることを示唆しています。例えば：

#pragma omp parallel for
    for(int thread=0; thread<4; thread++)
    { // No need to unroll the loop by hand: the OpenMP runtime
      // maps iterations onto threads according to the scheduling policy
        int a=0, b=0, c=0;
        for(int n=thread; n<nIterations; n+=4)
        {
            int value = do_computations(&a, &b, &c);
            // No need for a lock when a critical construct suffices
            omp_set_lock(&lock); 
            fprintf(f, "%d\n", value);
            omp_unset_lock(&lock);
        }
    } // Implicit barrier at the end of the parallel for
#pragma omp barrier 
// Why a barrier here, where only one thread is executing?

c - ループ内のOpenMP同期

3 に答える 3

Related

Reference