c# - C＃：アトキンのふるいの実装

Question

ここの誰かが共有したいアトキンのふるいの良い実装を持っているかどうか疑問に思いました。

私はそれを実装しようとしていますが、頭を完全に包むことができません。これが私がこれまでに持っているものです。

public class Atkin : IEnumerable<ulong>
{
    private readonly List<ulong> primes;
    private readonly ulong limit;

    public Atkin(ulong limit)
    {
        this.limit = limit;
        primes = new List<ulong>();
    }

    private void FindPrimes()
    {
        var isPrime = new bool[limit + 1];
        var sqrt = Math.Sqrt(limit);

        for (ulong x = 1; x <= sqrt; x++)
            for (ulong y = 1; y <= sqrt; y++)
            {
                var n = 4*x*x + y*y;
                if (n <= limit && (n % 12 == 1 || n % 12 == 5))
                    isPrime[n] ^= true;

                n = 3*x*x + y*y;
                if (n <= limit && n % 12 == 7)
                    isPrime[n] ^= true;

                n = 3*x*x - y*y;
                if (x > y && n <= limit && n % 12 == 11)
                    isPrime[n] ^= true;
            }

        for (ulong n = 5; n <= sqrt; n++)
            if (isPrime[n])
                for (ulong k = n*n; k <= limit; k *= k)
                    isPrime[k] = false;

        primes.Add(2);
        primes.Add(3);
        for (ulong n = 5; n <= limit; n++)
            if (isPrime[n])
                primes.Add(n);
    }


    public IEnumerator<ulong> GetEnumerator()
    {
        if (!primes.Any())
            FindPrimes();


        foreach (var p in primes)
            yield return p;
    }


    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}

ウィキペディアにリストされている擬似コードを「翻訳」しようとしたところですが、正しく機能していません。ですから、私は何かを誤解したか、何か間違ったことをしただけです。またはおそらく両方...

テストとして使用する最初の500素数のリストがあり、実装は40番（または41番？）で失敗します。

値はインデックスで異なります[40]
期待値：179
しかし、だった：175

私の間違いを見つけることができますか、共有できる実装がありますか、またはその両方ですか？

私が使用している正確なテストは次のようになります。

public abstract class AtkinTests
{
    [Test]
    public void GetEnumerator_FirstFiveHundredNumbers_AreCorrect()
    {
        var sequence = new Atkin(2000000);
        var actual = sequence.Take(500).ToArray();
        var expected = First500;

        CollectionAssert.AreEqual(expected, actual);
    }

    private static readonly ulong[] First500 = new ulong[]
        {
            2, 3, 5, 7, 11, 13, 17, ...
        };
}

score 10 · Accepted Answer

このコード：

for (ulong k = n*n; k <= limit; k *= k)
  isPrime[k] = false;

この擬似コードの忠実な翻訳ではないようです：

is_prime(k) ← false, k ∈ {n², 2n², 3n², ..., limit}

コードは、n * n、n ^ 4、n ^ 8などで実行されるように見えます。つまり、毎回n-squaredを追加するのではなく、毎回2乗します。これを試して：

ulong nSquared = n * n;
for (ulong k = nSquared; k <= limit; k += nSquared)
  isPrime[k] = false;

score 6 · Accepted Answer

Sieve of Atkin（SoA）の翻訳されたPythonソースからのAaron Mugatroydによる最後の回答はそれほど悪くはありませんが、次のようにいくつかの重要な最適化を見逃しているため、いくつかの点で改善できます。

彼の答えは、完全なモジュロ60の元のAtkinおよびBernsteinバージョンのSieveを使用していませんが、Wikipediaの記事からの疑似コードのわずかに改善されたバリエーションを使用しているため、トグル/カリング操作を組み合わせた数値のふるい範囲の約0.36倍を使用します。以下の私のコードは、アトキンのふるいに関する私のコメントのように、適度に効率的な非ページセグメントの擬似コードを使用しています。 7分の2。
彼のコードは奇数の表現のみを持つことでバッファサイズを削減しますが、私のコードはさらにビットパックを使用して、3と5で割り切れる数、および「奇数のみ」で示される2で割り切れる数の表現を排除します。これにより、メモリ要件がさらにほぼ半分（8/15）に削減され、平均メモリアクセス時間が短縮されるため、CPUキャッシュをより有効に活用して速度をさらに向上させることができます。
私のコードは、高速ルックアップテーブル（LUT）ポップカウント手法を使用して素数をカウントし、彼が使用するビットごとの手法を使用する約1秒と比較して、カウントにほとんど時間がかかりません。ただし、このサンプルコードでは、コードの時間指定部分からわずかな時間が取られています。
最後に、私のコードは、内部ループごとに最小限のコードでビット操作操作を最適化します。たとえば、奇数の表現インデックスを生成するために継続的な右シフトを使用せず、実際には、すべての内部ループを一定のモジュロ（ビット位置に等しい）演算として書き込むことにより、少しだけシフトします。同様に、アーロンの変換されたコードは、操作において非常に非効率的です。たとえば、プライムスクエアフリーカリングでは、プライムの2乗をインデックスに追加してから、チェックを必要としないように2倍を追加するのではなく、奇数の結果をチェックします。 ; 次に、すべてのループの場合と同様に、内側のループでカリング操作を実行する前に、数値を1ずつ右にシフト（2で除算）することで、チェックも冗長にします。この非効率的なコードは 1回の操作あたりの時間のほとんどがRAMメモリアクセスで使用されるため、この「大きなふるいバッファアレイ」手法を使用すると、広い範囲の実行時間に大きな違いが生じます（10億の範囲で約37 CPUクロックサイクル以上）。、ただし、CPUキャッシュに収まるより小さな範囲の場合よりも実行時間が大幅に遅くなります。つまり、操作ごとの実行速度の下限が高すぎます。

コードは次のとおりです。

//Sieve of Atkin based on full non page segmented modulo 60 implementation...

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

namespace NonPagedSoA {
  //implements the non-paged Sieve of Atkin (full modulo 60 version)...
  class SoA : IEnumerable<ulong> {
    private ushort[] buf = null;
    private long cnt = 0;
    private long opcnt = 0;
    private static byte[] modPRMS = { 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61 };
    private static ushort[] modLUT;
    private static byte[] cntLUT;
    //initialize the private LUT's...
    static SoA() {
      modLUT = new ushort[60];
      for (int i = 0, m = 0; i < modLUT.Length; ++i) {
        if ((i & 1) != 0 || (i + 7) % 3 == 0 || (i + 7) % 5 == 0) modLUT[i] = 0;
        else modLUT[i] = (ushort)(1 << (m++));
      }
      cntLUT = new byte[65536];
      for (int i = 0; i < cntLUT.Length; ++i) {
        var c = 0;
        for (int j = i; j > 0; j >>= 1) c += j & 1;
        cntLUT[i] = (byte)c;
      }
    }
    //initialization and all the work producing the prime bit array done in the constructor...
    public SoA(ulong range) {
      this.opcnt = 0;
      if (range < 7) {
        if (range > 1) {
          cnt = 1;
          if (range > 2) this.cnt += (long)(range - 1) / 2;
        }
        this.buf = new ushort[0];
      }
      else {
        this.cnt = 3;
        var nrng = range - 7; var lmtw = nrng / 60;
        //initialize sufficient wheels to non-prime
        this.buf = new ushort[lmtw + 1];

        //Put in candidate primes:
        //for the 4 * x ^ 2 + y ^ 2 quadratic solution toggles - all x odd y...
        ulong n = 6; // equivalent to 13 - 7 = 6...
        for (uint x = 1, y = 3; n <= nrng; n += (x << 3) + 4, ++x, y = 1) {
          var cb = n; if (x <= 1) n -= 8; //cancel the effect of skipping the first one...
          for (uint i = 0; i < 15 && cb <= range; cb += (y << 2) + 4, y += 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0)
              for (uint c = (uint)cbd, my = y + 15; c < buf.Length; c += my, my += 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
          }
        }
        //for the 3 * x ^ 2 + y ^ 2 quadratic solution toggles - x odd y even...
        n = 0; // equivalent to 7 - 7 = 0...
        for (uint x = 1, y = 2; n <= nrng; n += ((x + x + x) << 2) + 12, x += 2, y = 2) {
          var cb = n;
          for (var i = 0; i < 15 && cb <= range; cb += (y << 2) + 4, y += 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0)
              for (uint c = (uint)cbd, my = y + 15; c < buf.Length; c += my, my += 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
          }
        }
        //for the 3 * x ^ 2 - y ^ 2 quadratic solution toggles all x and opposite y = x - 1...
        n = 4; // equivalent to 11 - 7 = 4...
        for (uint x = 2, y = x - 1; n <= nrng; n += (ulong)(x << 2) + 4, y = x, ++x) {
          var cb = n; int i = 0;
          for ( ; y > 1 && i < 15 && cb <= nrng; cb += (ulong)(y << 2) - 4, y -= 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0) {
              uint c = (uint)cbd, my = y;
              for ( ; my >= 30 && c < buf.Length; c += my - 15, my -= 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
              if (my > 0 && c < buf.Length) { buf[c] ^= cm; /* ++this.opcnt; */ }
            }
          }
          if (y == 1 && i < 15) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if ((cm & 0x4822) != 0 && cbd < (ulong)buf.Length) { buf[cbd] ^= cm; /* ++this.opcnt; */ }
          }
        }

        //Eliminate squares of base primes, only for those on the wheel:
        for (uint i = 0, w = 0, pd = 0, pn = 0, msk = 1; w < this.buf.Length ; ++i) {
          uint p = pd + modPRMS[pn];
          ulong sqr = (ulong)p * (ulong)p; //to handle ranges above UInt32.MaxValue
          if (sqr > range) break;
          if ((this.buf[w] & msk) != 0) { //found base prime, square free it...
            ulong s = sqr - 7;
            for (int j = 0; s <= nrng && j < modPRMS.Length; s = sqr * modPRMS[j] - 7, ++j) {
              var cd = s / 60; var cm = (ushort)(modLUT[s % 60] ^ 0xFFFF);
              //may need ulong loop index for ranges larger than two billion
              //but buf length only good to about 2^31 * 60 = 120 million anyway,
              //even with large array setting and half that with 32-bit...
              for (ulong c = cd; c < (ulong)this.buf.Length; c += sqr) {
                this.buf[c] &= cm; // ++this.opcnt;
              }
            }
          }
          if (msk >= 0x8000) { msk = 1; pn = 0; ++w; pd += 60; }
          else { msk <<= 1; ++pn; }
        }

        //clear any overflow primes in the excess space in the last wheel/word:
        var ndx = nrng % 60; //clear any primes beyond the range
        for (; modLUT[ndx] == 0; --ndx) ;
        this.buf[lmtw] &= (ushort)((modLUT[ndx] << 1) - 1);
      }
    }

    //uses a fast pop count Look Up Table to return the total number of primes...
    public long Count {
      get {
        long cnt = this.cnt;
        for (int i = 0; i < this.buf.Length; ++i) cnt += cntLUT[this.buf[i]];
        return cnt;
      }
    }

    //returns the number of toggle/cull operations used to sieve the prime bit array...
    public long Ops {
      get {
        return this.opcnt;
      }
    }

    //generate the enumeration of primes...
    public IEnumerator<ulong> GetEnumerator() {
      yield return 2; yield return 3; yield return 5;
      ulong pd = 0;
      for (uint i = 0, w = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
        if ((this.buf[w] & msk) != 0) //found a prime bit...
          yield return pd + modPRMS[pn]; //add it to the list
        if (msk >= 0x8000) { msk = 1; pn = 0; ++w; pd += 60; }
        else { msk <<= 1; ++pn; }
      }
    }

    //required for the above enumeration...
    IEnumerator IEnumerable.GetEnumerator() {
      return this.GetEnumerator();
    }
  }

  class Program {
    static void Main(string[] args) {
      Console.WriteLine("This program calculates primes by a simple full version of the Sieve of Atkin.\r\n");

      const ulong n = 1000000000;

      var elpsd = -DateTime.Now.Ticks;

      var gen = new SoA(n);

      elpsd += DateTime.Now.Ticks;

      Console.WriteLine("{0} primes found to {1} using {2} operations in {3} milliseconds.", gen.Count, n, gen.Ops, elpsd / 10000);

      //Output prime list for testing...
      //Console.WriteLine();
      //foreach (var p in gen) {
      //  Console.Write(p + " ");
      //}
      //Console.WriteLine();

//Test options showing what one can do with the enumeration, although more slowly...
//      Console.WriteLine("\r\nThere are {0} primes with the last one {1} and the sum {2}.",gen.Count(),gen.Last(),gen.Sum(x => (long)x));

      Console.Write("\r\nPress any key to exit:");
      Console.ReadKey(true);
      Console.WriteLine();
    }
  }
}

このコードは、Aaronのコードの約2倍の速度で実行されます（i7-2700K（3.5 GHz）で64ビットまたは32ビットモードを使用して約2.7秒、バッファーは約16.5メガバイト、トグル/プライムスクエアフリーカリング操作の組み合わせは約25.8億）（「++ this.opcnt」ステートメントのコメントを外すことで表示できます）10億のふるい範囲で、カウント時間なしのコードの5.4 / 6.2秒（32ビット/ 64ビット）とほぼ比較最大10億をふるいにかけるために、約3億5900万のトグル/カリング操作を組み合わせて使用すると、メモリ使用量が2倍になります。

エラトステネスのふるい（SoE）の彼の最も最適化されたナイーブオッズのみの実装よりも高速ですが、それはエラトステネスのふるいよりもアトキンのふるいを速くすることはありません。上記のSoEplusへのSoA実装では、最大のホイール因数分解が使用され、SoEはこれとほぼ同じ速度になります。

分析： 完全に最適化されたSoEの操作数は、10億のふるい範囲でのSoAの操作数とほぼ同じですが、これらの非ページ実装の主なボトルネックは、ふるいバッファーサイズが超過した場合のメモリアクセスです。 CPUキャッシュサイズ（i7の場合、1クロックサイクルアクセスで32キロバイトのL1キャッシュ、約4クロックサイクルのアクセス時間で256キロバイトのL2キャッシュ、約20クロックサイクルのアクセス時間で8メガバイトのL3キャッシュ）。 100クロックサイクル。

アルゴリズムをページセグメンテーションに適合させると、どちらもメモリアクセス速度が約8倍向上するため、使用可能なメモリに収まらない範囲をふるいにかけることができます。ただし、数百倍に急速に拡大するカリングスキャンの大きな進歩により、アルゴリズムの「プライムスクエアフリー」部分の実装が困難なため、ふるい範囲が非常に大きくなり始めると、SoEはSoAを超えて増加し続けます。ページバッファのサイズ。同様に、そしておそらくもっと深刻なことに、各ページバッファの最も低い表現での「y」の値に関して「x」の各値の新しい開始点を計算することは、非常にメモリおよび/または計算集約的になります。範囲が拡大するにつれて、ページングされたSoAの効率がSoEと比較して大幅に低下します。

EDIT_ADD： Aaron Murgatroydが使用するオッズのみのSoEは、10億のふるい範囲で約10億2600万のカリング操作を使用するため、SoAの約4倍の操作で実行されるため、実行速度は約4倍遅くなりますが、SoAは実装されていてもここでは、より複雑な内部ループがあり、特にオッズの割合がはるかに高いため、カリングスキャンのストライドはSoAのナイーブオッズよりもはるかに短いSoEのみのSoEの平均メモリアクセス時間ははるかに優れていますふるいバッファがCPUキャッシュサイズを大幅に超えているにもかかわらず（キャッシュの関連付けをより適切に使用）。これは、理論的には作業の4分の1しか実行していないように見えるにもかかわらず、上記のSoAがオッズのみのSoEの約2倍の速度である理由を説明しています。

上記のSoAと同様のモジュロ内部ループを使用するアルゴリズムを使用し、同じ2/3/5ホイール因数分解を実装した場合、SoEはカリング操作の数を約4億5000万回に減らし、約50％多くなります。演算はSoAよりも動作し、理論的にはSoAよりもわずかに遅くなりますが、この「ナイーブな」大容量メモリバッファの使用では、平均してSoAよりもカリングストライドが少し小さいため、ほぼ同じ速度で実行される可能性があります。ホイールの因数分解を2/3/5/7ホイールに増やすと、SoEカリング操作が10億のカリング範囲で約0.314に減少し、そのバージョンのSoEがこのアルゴリズムでほぼ同じ速度で実行される可能性があります。

2/3/5/7/11/13/17/19の素因数のふるい配列を事前にカリング（パターンでコピー）することにより、実行時間のコストをほとんどかけずにホイール因数分解をさらに使用して、ふるいの範囲が10億の場合、カル操作の総数は約25.1億になり、SoEは、この最適化されたバージョンのSoAでも、これらの大容量メモリバッファーのバージョンでも、SoEにまだ多くの容量がある場合でも、より高速またはほぼ同じ速度で実行されます。上記よりもコードの複雑さが少なくなります。

したがって、SoEの操作数は、ナイーブまたはオッズのみ、または2/3/5のホイール因数分解バージョンから大幅に削減できるため、操作数はSoAの場合とほぼ同じであることがわかります。同時に、より複雑でない内部ループとより効率的なメモリアクセスの両方により、操作あたりの時間は実際には短くなる可能性があります。 END_EDIT_ADD

EDIT_ADD2： ここで、上記のSoAと同様の定数モジュロ/ビット位置手法を使用して、SoEのコードを追加します。これは、上記のリンク先の疑似コードに従って行われます。高いホイール因数分解と事前カリングが適用されているにもかかわらず、コードは上記のSoAよりもかなり複雑ではないため、実際には、選別操作までのSoAのトグル/カリング操作の合計数よりも少なくなります。約20億の。次のようなコード：

EDIT_FINAL以下の修正されたコードとそれに関連するコメントEND_EDIT_FINAL

//Sieve of Eratosthenes based on maximum wheel factorization and pre-culling implementation...

using System;
using System.Collections;
using System.Collections.Generic;

namespace NonPagedSoE {
  //implements the non-paged Sieve of Eratosthenes (full modulo 210 version with preculling)...
  class SoE : IEnumerable<ulong> {
    private ushort[] buf = null;
    private long cnt = 0;
    private long opcnt = 0;
    private static byte[] basePRMS = { 2, 3, 5, 7, 11, 13, 17, 19 };
    private static byte[] modPRMS = { 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, //positions + 23
                                      97, 101, 103, 107, 109, 113, 121, 127, 131, 137, 139, 143, 149, 151, 157, 163,
                                      167, 169, 173, 179, 181 ,187 ,191 ,193, 197, 199, 209, 211, 221, 223, 227, 229 };
    private static byte[] gapsPRMS = { 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6, 4, 6, 8,
                                       4, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6, 6, 4,
                                       2, 4, 6, 2, 6, 4, 2, 4, 2, 10, 2, 10, 2, 4, 2, 4 };
    private static ulong[] modLUT;
    private static byte[] cntLUT;
    //initialize the private LUT's...
    static SoE() {
      modLUT = new ulong[210];
      for (int i = 0, m = 0; i < modLUT.Length; ++i) {
        if ((i & 1) != 0 || (i + 23) % 3 == 0 || (i + 23) % 5 == 0 || (i + 23) % 7 == 0) modLUT[i] = 0;
        else modLUT[i] = 1UL << (m++);
      }
      cntLUT = new byte[65536];
      for (int i = 0; i < cntLUT.Length; ++i) {
        var c = 0;
        for (int j = i ^ 0xFFFF; j > 0; j >>= 1) c += j & 1; //reverse logic; 0 is prime; 1 is composite
        cntLUT[i] = (byte)c;
      }
    }
    //initialization and all the work producing the prime bit array done in the constructor...
    public SoE(ulong range) {
      this.opcnt = 0;
      if (range < 23) {
        if (range > 1) {
          for (int i = 0; i < modPRMS.Length; ++i) if (modPRMS[i] <= range) this.cnt++; else break;
        }
        this.buf = new ushort[0];
      }
      else {
        this.cnt = 8;
        var nrng = range - 23; var lmtw = nrng / 210; var lmtwt3 = lmtw * 3; 
        //initialize sufficient wheels to prime
        this.buf = new ushort[lmtwt3 + 3]; //initial state of all zero's is all potential prime.

        //initialize array to account for preculling the primes of 11, 13, 17, and 19;
        //(2, 3, 5, and 7 already eliminated by the bit packing to residues).
        for (int pn = modPRMS.Length - 4; pn < modPRMS.Length; ++pn) {
          uint p = modPRMS[pn] - 210u; ulong pt3 = p * 3;
          ulong s = p * p - 23;
          ulong xrng = Math.Min(9699709, nrng); // only do for the repeating master pattern size
          ulong nwrds = (ulong)Math.Min(138567, this.buf.Length);
          for (int j = 0; s <= xrng && j < modPRMS.Length; s += p * gapsPRMS[(pn + j++) % 48]) {
            var sm = modLUT[s % 210];
            var si = (sm < (1UL << 16)) ? 0UL : ((sm < (1UL << 32)) ? 1UL : 2UL);
            var cd = s / 210 * 3 + si; var cm = (ushort)(sm >> (int)(si << 4));
            for (ulong c = cd; c < nwrds; c += pt3) { //tight culling loop for size of master pattern
              this.buf[c] |= cm; // ++this.opcnt; //reverse logic; mark composites with ones.
            }
          }
        }
        //Now copy the master pattern so it repeats across the main buffer, allow for overflow...
        for (long i = 138567; i < this.buf.Length; i += 138567)
          if (i + 138567 <= this.buf.Length)
            Array.Copy(this.buf, 0, this.buf, i, 138567);
          else Array.Copy(this.buf, 0, this.buf, i, this.buf.Length - i);

        //Eliminate all composites which are factors of base primes, only for those on the wheel:
        for (uint i = 0, w = 0, wi = 0, pd = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
          uint p = pd + modPRMS[pn];
          ulong sqr = (ulong)p * (ulong)p;
          if (sqr > range) break;
          if ((this.buf[w] & msk) == 0) { //found base prime, mark its composites...
            ulong s = sqr - 23; ulong pt3 = p * 3;
            for (int j = 0; s <= nrng && j < modPRMS.Length; s += p * gapsPRMS[(pn + j++) % 48]) {
              var sm = modLUT[s % 210];
              var si = (sm < (1UL << 16)) ? 0UL : ((sm < (1UL << 32)) ? 1UL : 2UL);
              var cd = s / 210 * 3 + si; var cm = (ushort)(sm >> (int)(si << 4));
              for (ulong c = cd; c < (ulong)this.buf.Length; c += pt3) { //tight culling loop
                this.buf[c] |= cm; // ++this.opcnt; //reverse logic; mark composites with ones.
              }
            }
          }
          ++pn;
          if (msk >= 0x8000) { msk = 1; ++w; ++wi; if (wi == 3) { wi = 0; pn = 0; pd += 210; } }
          else msk <<= 1;
        }

        //clear any overflow primes in the excess space in the last wheel/word:
        var ndx = nrng % 210; //clear any primes beyond the range
        for (; modLUT[ndx] == 0; --ndx) ;
        var cmsk = (~(modLUT[ndx] - 1)) << 1; //force all bits above to be composite ones.
        this.buf[lmtwt3++] |= (ushort)cmsk;
        this.buf[lmtwt3++] |= (ushort)(cmsk >> 16);
        this.buf[lmtwt3] |= (ushort)(cmsk >> 32);
      }
    }

    //uses a fast pop count Look Up Table to return the total number of primes...
    public long Count {
      get {
        long cnt = this.cnt;
        for (int i = 0; i < this.buf.Length; ++i) cnt += cntLUT[this.buf[i]];
        return cnt;
      }
    }

    //returns the number of cull operations used to sieve the prime bit array...
    public long Ops {
      get {
        return this.opcnt;
      }
    }

    //generate the enumeration of primes...
    public IEnumerator<ulong> GetEnumerator() {
      yield return 2; yield return 3; yield return 5; yield return 7;
      yield return 11; yield return 13; yield return 17; yield return 19;
      ulong pd = 0;
      for (uint i = 0, w = 0, wi = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
        if ((this.buf[w] & msk) == 0) //found a prime bit...
          yield return pd + modPRMS[pn];
        ++pn;
        if (msk >= 0x8000) { msk = 1; ++w; ++wi; if (wi == 3) { wi = 0; pn = 0; pd += 210; } }
        else msk <<= 1;
      }
    }

    //required for the above enumeration...
    IEnumerator IEnumerable.GetEnumerator() {
      return this.GetEnumerator();
    }
  }

  class Program {
    static void Main(string[] args) {
      Console.WriteLine("This program calculates primes by a simple maximually wheel factorized version of the Sieve of Eratosthenes.\r\n");

      const ulong n = 1000000000;

      var elpsd = -DateTime.Now.Ticks;

      var gen = new SoE(n);

      elpsd += DateTime.Now.Ticks;

      Console.WriteLine("{0} primes found to {1} using {2} operations in {3} milliseconds.", gen.Count, n, gen.Ops, elpsd / 10000);

//      Console.WriteLine();
//      foreach (var p in gen) {
//        Console.Write(p + " ");
//      }
//      Console.WriteLine();

      //      Console.WriteLine("\r\nThere are {0} primes with the last one {1} and the sum {2}.",gen.Count(),gen.Last(),gen.Sum(x => (long)x));

      Console.Write("\r\nPress any key to exit:");
      Console.ReadKey(true);
      Console.WriteLine();
    }
  }
}

このコードは、実際には上記のSoAよりも数パーセント高速に実行されます。これは、操作がわずかに少なく、10億の範囲のこの大きなアレイサイズの主なボトルネックは、40〜100CPUクロックサイクルのようなメモリアクセス時間です。 CPUとメモリの仕様によって異なります。これは、ほとんどの時間がメモリアクセスの待機に費やされるため、コードの最適化（操作の総数を減らす以外）は効果がないことを意味します。いずれにせよ、巨大なメモリバッファを使用することは、広い範囲をふるいにかけるための最も効率的な方法ではありません。同じ最大ホイール因数分解を使用したページセグメンテーションを使用したSoEの場合、最大で約8倍の改善が見られます。マルチプロセッシング）。

SoAの漸近的な複雑さの軽減による利益は、関連するページ処理のオーバーヘッド要因によって急速に消費されるため、SoAは、SoEと比較して40億をはるかに超える範囲で実際に不足しているのは、ページのセグメンテーションとマルチプロセッシングの実装です。プライムスクエアフリー処理とはるかに多くのページ開始アドレスの計算。あるいは、メモリ消費の莫大なコストとこれらのマーカーストア構造へのアクセスのさらなる非効率性でマーカーをRAMメモリに格納することによってこれを克服します。 END_EDIT_ADD2

要するに、SoAは、完全にホイールファクタリングされたSoEと比較して、実際には実用的なふるいではありません。漸近的な複雑さが増し、完全に最適化されたSoEにパフォーマンスが近づき始めるのと同じように、相対的なメモリアクセス時間とページセグメンテーションの複雑さ、および一般的にはより複雑で記述が難しいことに関する実際の実装の詳細。私の意見では、SoEと比較すると、実用的なふるいというよりも、興味深い知的概念と精神的運動です。

いつの日か、これらの手法をマルチスレッドページにセグメント化されたエラトステネスのふるいに適応させて、C＃でAtkinとBernsteinによる「C」でのSoAの「primegen」実装とほぼ同じ速度にし、広範囲にわたって水から吹き飛ばします。シングルスレッドでも約40億を超え、i7でマルチスレッドを使用すると最大約4の速度がさらに向上します（ハイパースレッディングを含む8つのコア）。

score 3 · Accepted Answer

これが別の実装です。BitArrayを使用してメモリを節約します。Parallel.Forには.NETFramework4が必要です。

static List<int> FindPrimesBySieveOfAtkins(int max)
{
//  var isPrime = new BitArray((int)max+1, false); 
//  Can't use BitArray because of threading issues.
    var isPrime = new bool[max + 1];
    var sqrt = (int)Math.Sqrt(max);

    Parallel.For(1, sqrt, x =>
    {
        var xx = x * x;
        for (int y = 1; y <= sqrt; y++)
        {
            var yy = y * y;
            var n = 4 * xx + yy;
            if (n <= max && (n % 12 == 1 || n % 12 == 5))
                isPrime[n] ^= true;

            n = 3 * xx + yy;
            if (n <= max && n % 12 == 7)
                isPrime[n] ^= true;

            n = 3 * xx - yy;
            if (x > y && n <= max && n % 12 == 11)
                isPrime[n] ^= true;
        }
    });

    var primes = new List<int>() { 2, 3 };
    for (int n = 5; n <= sqrt; n++)
    {
        if (isPrime[n])
        {
            primes.Add(n);
            int nn = n * n;
            for (int k = nn; k <= max; k += nn)
                isPrime[k] = false;
        }
    }

    for (int n = sqrt + 1; n <= max; n++)
        if (isPrime[n])
            primes.Add(n);

    return primes;
}

score 1 · Accepted Answer

これがSieveofAtkinのより高速な実装です。ここで、このPythonスクリプトからアルゴリズムを盗みました（アルゴリズムの功績は認めません）。

http://programmingpraxis.com/2010/02/19/sieve-of-atkin-improved/

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.Int32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Atkin : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// The number of primes calculated or null if not calculated yet.
        /// </summary>
        protected PrimeType? mpCount = null;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Atkin generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Atkin(PrimeType limit)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mpLimit = limit;

            FindPrimes();
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            PrimeType lpYLimit, lpN, lpXX3, lpXX4, lpDXX, lpDN, lpDXX4, lpXX, lpX, lpYY, lpMinY, lpS, lpK;

            fixed (BitArrayType* lpbOddPrime = &mbaOddPrime[0])
            {
                // n = 3x^2 + y^2 section
                lpXX3 = 3;
                for (lpDXX = 0; lpDXX < 12 * SQRT((mpLimit - 1) / 3); lpDXX += 24)
                {
                    lpXX3 += lpDXX;
                    lpYLimit = (12 * SQRT(mpLimit - lpXX3)) - 36;
                    lpN = lpXX3 + 16;

                    for (lpDN = -12; lpDN < lpYLimit + 1; lpDN += 72)
                    {
                        lpN += lpDN;
                        lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                            (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }

                    lpN = lpXX3 + 4;
                    for (lpDN = 12; lpDN < lpYLimit + 1; lpDN += 72)
                    {
                        lpN += lpDN;
                        lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                            (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }
                }

                //    # n = 4x^2 + y^2 section
                lpXX4 = 0;
                for (lpDXX4 = 4; lpDXX4 < 8 * SQRT((mpLimit - 1) / 4) + 4; lpDXX4 += 8)
                {
                    lpXX4 += lpDXX4;
                    lpN = lpXX4 + 1;

                    if ((lpXX4 % 3) != 0)
                    {
                        for (lpDN = 0; lpDN < (4 * SQRT(mpLimit - lpXX4)) - 3; lpDN += 8)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }
                    }
                    else
                    {
                        lpYLimit = (12 * SQRT(mpLimit - lpXX4)) - 36;
                        lpN = lpXX4 + 25;

                        for (lpDN = -24; lpDN < lpYLimit + 1; lpDN += 72)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }

                        lpN = lpXX4 + 1;
                        for (lpDN = 24; lpDN < lpYLimit + 1; lpDN += 72)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }
                    }
                }

                //    # n = 3x^2 - y^2 section
                lpXX = 1;
                for (lpX = 3; lpX < SQRT(mpLimit / 2) + 1; lpX += 2)
                {
                    lpXX += 4 * lpX - 4;
                    lpN = 3 * lpXX;

                    if (lpN > mpLimit)
                    {
                        lpMinY = ((SQRT(lpN - mpLimit) >> 2) << 2);
                        lpYY = lpMinY * lpMinY;
                        lpN -= lpYY;
                        lpS = 4 * lpMinY + 4;
                    }
                    else
                        lpS = 4;

                    for (lpDN = lpS; lpDN < 4 * lpX; lpDN += 8)
                    {
                        lpN -= lpDN;
                        if (lpN <= mpLimit && lpN % 12 == 11)
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }
                }

                // xx = 0
                lpXX = 0;
                for (lpX = 2; lpX < SQRT(mpLimit / 2) + 1; lpX += 2)
                {
                    lpXX += 4*lpX - 4;
                    lpN = 3*lpXX;

                    if (lpN > mpLimit)
                    {
                        lpMinY = ((SQRT(lpN - mpLimit) >> 2) << 2) - 1;
                        lpYY = lpMinY * lpMinY;
                        lpN -= lpYY;
                        lpS = 4*lpMinY + 4;
                    }
                    else
                    {
                        lpN -= 1;
                        lpS = 0;
                    }

                    for (lpDN = lpS; lpDN < 4 * lpX; lpDN += 8)
                    {
                        lpN -= lpDN;
                        if (lpN <= mpLimit && lpN % 12 == 11)
                            lpbOddPrime[(lpN>>1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN>>1) % cbBitsPerBlock));
                    }
                }

                // # eliminate squares
                for (lpN = 5; lpN < SQRT(mpLimit) + 1; lpN += 2)
                    if ((lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] & ((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock))) != 0)
                        for (lpK = lpN * lpN; lpK < mpLimit; lpK += lpN * lpN)
                            if ((lpK & 1) == 1)
                                lpbOddPrime[(lpK >> 1) / cbBitsPerBlock] &=
                                    (BitArrayType)~((BitArrayType)1 << (int)((lpK >> 1) % cbBitsPerBlock));
            }
        }

        /// <summary>
        /// Calculates the truncated square root for a number.
        /// </summary>
        /// <param name="value">The value to get the square root for.</param>
        /// <returns>The truncated sqrt of the value.</returns>
        private unsafe PrimeType SQRT(PrimeType value)
        {
            return (PrimeType)Math.Sqrt(value);
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            if ((index & 1) == 1)
                return (bits[(index >> 1) / cbBitsPerBlock] & ((BitArrayType)1 << (int)((index >> 1) % cbBitsPerBlock))) != 0;
            else
                return false;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                if (!mpCount.HasValue)
                {
                    PrimeType lpCount = 0;
                    foreach (PrimeType liPrime in this) lpCount++;
                    mpCount = lpCount;
                }

                return mpCount.Value;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            //    return [2,3] + filter(primes.__getitem__, xrange(5,limit,2))

            // Two & Three always prime.
            yield return 2;
            yield return 3;

            // Start at first block, third MSB (5).
            int liBlock = 0;
            byte lbBit = 2;
            BitArrayType lbaCurrent = mbaOddPrime[0] >> lbBit;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 5; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 1)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value. 
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    lbBit = 0;
                    lbaCurrent = mbaOddPrime[++liBlock];

                    //// Move to first bit of next block skipping full blocks.
                    while (lbaCurrent == 0)
                    {
                        lpN += ((PrimeType)cbBitsPerBlock) << 1;
                        if (lpN <= mpLimit)
                            lbaCurrent = mbaOddPrime[++liBlock];
                        else
                            break;
                    }
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns></returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

エラトステネスのふるいの私の最も最適化されたバージョンに近い速度ですが、それでも約20％遅いので、ここで見つけることができます：

https://stackoverflow.com/a/9700790/738380

score 0 · Accepted Answer

ここでは、CompartmentalisedParallelというクラスを使用しています。これにより、並列forループを実行できますが、インデックスがグループ化されるようにスレッドの数を制御できます。ただし、スレッドの問題により、変更されるたびにBitArrayをロックするか、スレッドごとに個別のBitArrayを作成して、最後にそれらをXORする必要があります。最初のオプションは、ロックの量が多いため、かなり遅くなりました。オプションは私にとってより速いようでした！

using System;
using System.Collections;
using System.Collections.Generic;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    public class Atkin : Primes
    {
        protected BitArray mbaPrimes;
        protected bool mbThreaded = true;

        public Atkin(int limit)
            : this(limit, true)
        {
        }

        public Atkin(int limit, bool threaded)
            : base(limit)
        {
            mbThreaded = threaded;
            if (mbaPrimes == null) FindPrimes();
        }

        public bool Threaded
        {
            get
            {
                return mbThreaded;
            }
        }

        public override IEnumerator<int> GetEnumerator()
        {
            yield return 2;
            yield return 3;
            for (int lsN = 5; lsN <= msLimit; lsN += 2)
                if (mbaPrimes[lsN]) yield return lsN;
        }

        private void FindPrimes()
        {
            mbaPrimes = new BitArray(msLimit + 1, false);

            int lsSQRT = (int)Math.Sqrt(msLimit);

            int[] lsSquares = new int[lsSQRT + 1];
            for (int lsN = 0; lsN <= lsSQRT; lsN++)
                lsSquares[lsN] = lsN * lsN;

            if (Threaded)
            {
                CompartmentalisedParallel.For<BitArray>(
                    1, lsSQRT + 1, new ParallelOptions(),
                    (start, finish) => { return new BitArray(msLimit + 1, false); },
                    (lsX, lsState, lbaLocal) =>
                    {
                        int lsX2 = lsSquares[lsX];

                        for (int lsY = 1; lsY <= lsSQRT; lsY++)
                        {
                            int lsY2 = lsSquares[lsY];

                            int lsN = 4 * lsX2 + lsY2;
                            if (lsN <= msLimit && (lsN % 12 == 1 || lsN % 12 == 5))
                                lbaLocal[lsN] ^= true;

                            lsN -= lsX2;
                            if (lsN <= msLimit && lsN % 12 == 7)
                                lbaLocal[lsN] ^= true;

                            if (lsX > lsY)
                            {
                                lsN -= lsY2 * 2;
                                if (lsN <= msLimit && lsN % 12 == 11)
                                    lbaLocal[lsN] ^= true;
                            }
                        }

                        return lbaLocal;
                    },
                    (lbaResult, start, finish) =>
                    {
                        lock (mbaPrimes) 
                            mbaPrimes.Xor(lbaResult);
                    },
                    -1
                );
            }
            else
            {
                for (int lsX = 1; lsX <= lsSQRT; lsX++)
                {
                    int lsX2 = lsSquares[lsX];

                    for (int lsY = 1; lsY <= lsSQRT; lsY++)
                    {
                        int lsY2 = lsSquares[lsY];

                        int lsN = 4 * lsX2 + lsY2;
                        if (lsN <= msLimit && (lsN % 12 == 1 || lsN % 12 == 5))
                            mbaPrimes[lsN] ^= true;

                        lsN -= lsX2;
                        if (lsN <= msLimit && lsN % 12 == 7)
                            mbaPrimes[lsN] ^= true;

                        if (lsX > lsY)
                        {
                            lsN -= lsY2 * 2;
                            if (lsN <= msLimit && lsN % 12 == 11)
                                mbaPrimes[lsN] ^= true;
                        }
                    }
                }
            }

            for (int lsN = 5; lsN < lsSQRT; lsN += 2)
                if (mbaPrimes[lsN])
                {
                    var lsS = lsSquares[lsN];
                    for (int lsK = lsS; lsK <= msLimit; lsK += lsS)
                        mbaPrimes[lsK] = false;
                }
        }
    }
}

そして、CompartmentalisedParallelクラス：

using System;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    public static class CompartmentalisedParallel
    {
        #region Int

        private static int[] CalculateCompartments(int startInclusive, int endExclusive, ref int threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = endExclusive - startInclusive;

            int[] liThreadIndexes = new int[threads + 1];
            liThreadIndexes[threads] = endExclusive - 1;
            int liIndexesPerThread = (endExclusive - startInclusive) / threads;
            for (int liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        public static void For<TLocal>(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Func<int, int, TLocal> localInit,
            Func<int, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, int, int> localFinally,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        TLocal llLocal = localInit(liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);

                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState, llLocal);

                        localFinally(llLocal, liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);
                    }
                );
            else
            {
                TLocal llLocal = localInit(startInclusive, endExclusive);
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null, llLocal);
                localFinally(llLocal, startInclusive, endExclusive);
            }
        }

        public static void For(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Action<int, ParallelLoopState> body,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState);
                    }
                );
            else
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null);
        }

        public static void For(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Action<int> body,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread) =>
                    {
                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter);
                    }
                );
            else
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter);
        }

        public static void For(
            int startInclusive, int endExclusive,
            Action<int, ParallelLoopState> body,
            int threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For(
            int startInclusive, int endExclusive,
            Action<int> body,
            int threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For<TLocal>(
            int startInclusive, int endExclusive,
            Func<int, int, TLocal> localInit,
            Func<int, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, int, int> localFinally,
            int threads)
        {
            For<TLocal>(startInclusive, endExclusive, new ParallelOptions(), localInit, body, localFinally, threads);
        }

        #endregion

        #region Long

        private static long[] CalculateCompartments(long startInclusive, long endExclusive, ref long threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = endExclusive - startInclusive;

            long[] liThreadIndexes = new long[threads + 1];
            liThreadIndexes[threads] = endExclusive - 1;
            long liIndexesPerThread = (endExclusive - startInclusive) / threads;
            for (long liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        public static void For<TLocal>(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Func<long, long, TLocal> localInit,
            Func<long, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, long, long> localFinally,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        TLocal llLocal = localInit(liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);

                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState, llLocal);

                        localFinally(llLocal, liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);
                    }
                );
            else
            {
                TLocal llLocal = localInit(startInclusive, endExclusive);
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null, llLocal);
                localFinally(llLocal, startInclusive, endExclusive);
            }
        }

        public static void For(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Action<long, ParallelLoopState> body,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState);
                    }
                );
            else
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null);
        }

        public static void For(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Action<long> body,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread) =>
                    {
                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter);
                    }
                );
            else
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter);
        }

        public static void For(
            long startInclusive, long endExclusive,
            Action<long, ParallelLoopState> body,
            long threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For(
            long startInclusive, long endExclusive,
            Action<long> body,
            long threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For<TLocal>(
            long startInclusive, long endExclusive,
            Func<long, long, TLocal> localInit,
            Func<long, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, long, long> localFinally,
            long threads)
        {
            For<TLocal>(startInclusive, endExclusive, new ParallelOptions(), localInit, body, localFinally, threads);
        }

        #endregion
    }
}

素数基本クラス：

using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    public abstract class Primes : IEnumerable<int>
    {
        protected readonly int msLimit;

        public Primes(int limit)
        {
            msLimit = limit;
        }

        public int Limit
        {
            get
            {
                return msLimit;
            }
        }

        public int Count
        {
            get
            {
                int liCount = 0;
                foreach (int liPrime in this)
                    liCount++;
                return liCount;
            }
        }

        public abstract IEnumerator<int> GetEnumerator();

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}

次のようにして使用します。

    var lpPrimes = new Atkin(count, true);
    Console.WriteLine(lpPrimes.Count);
    Console.WriteLine(s.ElapsedMilliseconds);

ただし、Atkinのマルチスレッドモードで実行されている4コアCPUを使用している場合でも、エラトステネスの方がすべての場合に高速であることがわかりました。

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    public class Eratosthenes : Primes
    {
        protected BitArray mbaOddEliminated;

        public Eratosthenes(int limit)
            : base(limit)
        {
            if (mbaOddEliminated == null) FindPrimes();
        }

        public override IEnumerator<int> GetEnumerator()
        {
            yield return 2;
            for (int lsN = 3; lsN <= msLimit; lsN+=2)
                if (!mbaOddEliminated[lsN>>1]) yield return lsN;
        }

        private void FindPrimes()
        {
            mbaOddEliminated = new BitArray((msLimit>>1) + 1);
            int lsSQRT = (int)Math.Sqrt(msLimit);
            for (int lsN = 3; lsN < lsSQRT + 1; lsN += 2)
                if (!mbaOddEliminated[lsN>>1])
                    for (int lsM = lsN*lsN; lsM <= msLimit; lsM += lsN<<1)
                        mbaOddEliminated[lsM>>1] = true;
        }
    }
}

Atkinをより速く実行するようになったら、私に知らせてください！

score 0 · Accepted Answer

これは、カスタムFixBitArraysと安全でないコードを使用したエラトステネスのふるいの改善です。これは以前のエラトステネスアルゴリズムよりも約225％速く、クラスはスタンドアロンです（これはマルチスレッドではありません-エラトステネスはマルチスレッドと互換性がありません）、 AMD Phenom II X4 965プロセッサでは、9,261ミリ秒でプライムを1,000,000,000の制限まで計算できます。

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.UInt32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Eratosthenes : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddNotPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Eratosthenes generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Eratosthenes(PrimeType limit)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mpLimit = limit;

            FindPrimes();
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddNotPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            // Cache Sqrt of limit.
            PrimeType lpSQRT = (PrimeType)Math.Sqrt(mpLimit);

            // Fix the bit array for pointer access
            fixed (BitArrayType* lpbOddNotPrime = &mbaOddNotPrime[0])
                // Scan primes up to lpSQRT
                for (PrimeType lpN = 3; lpN <= lpSQRT; lpN += 2)
                    // If the current bit value for index lpN is cleared (prime)
                    if (
                            (
                                lpbOddNotPrime[(lpN >> 1) / cbBitsPerBlock] & 
                                ((BitArrayType)1 << (BitsPerBlockType)((lpN >> 1) % cbBitsPerBlock))
                            ) == 0
                        )
                        // Leave it cleared (prime) and mark all multiples of lpN*2 from lpN*lpN as not prime
                        for (PrimeType lpM = lpN * lpN; lpM <= mpLimit; lpM += lpN << 1)
                            // Set as not prime
                            lpbOddNotPrime[(lpM >> 1) / cbBitsPerBlock] |= 
                                (BitArrayType)((BitArrayType)1 << (BitsPerBlockType)((lpM >> 1) % cbBitsPerBlock));
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            return (bits[index / cbBitsPerBlock] & ((BitArrayType)1 << (BitsPerBlockType)(index % cbBitsPerBlock))) != 0;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                PrimeType lptCount = 0;
                foreach (PrimeType liPrime in this)
                    lptCount++;
                return lptCount;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddNotPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            // Two always prime.
            yield return 2;

            // Start at first block, second MSB.
            int liBlock = 0;
            byte lbBit = 1;
            BitArrayType lbaCurrent = mbaOddNotPrime[0] >> 1;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 3; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 0)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value.
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    // Move to first bit of next block.
                    lbBit = 0;
                    liBlock++;
                    lbaCurrent = mbaOddNotPrime[liBlock];
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator for the prime numbers.</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

1,000,000,000で見つかった素数：9,261ミリ秒で50,847,534

c# - C＃：アトキンのふるいの実装

6 に答える 6

Related

Reference