c++ - C++ で基本的な 128 ビット整数計算を行う効率的な方法は?

Question

数年前、Cuda で基本的な 128 ビット整数演算を行う方法が必要でした（質問「cuda で 128 ビット整数?」を参照）。今、私は同じ問題を抱えていますが、今回は、128 ビットをサポートしていない 32 ビット組み込みシステム (Intel Edison) で基本的な 128 ビット演算 (加算、ビットシフト、および乗算) を実行する必要があります。ただし、64 ビット整数 (unsigned long long int) は直接サポートされています。

前回回答いただいたasmコードをCPUで素朴に使ってみたのですが、エラーが多発してしまいました。私は本当にasmの経験がないので、64ビット整数を使用して、128ビットで加算、乗算、ビットシフトを実装する最も効率的な方法は何ですか?

score 10 · Accepted Answer

更新: OP がまだ回答を承認していないので（ヒント、ヒント）、もう少しコードを追加しました。

上記のライブラリを使用することは、おそらく良い考えです。今は数個の関数しか必要ないかもしれませんが、最終的にはもう 1 つ必要になるかもしれません。そのあとにもう一つ。最終的には、独自の 128 ビット数学ライブラリを作成、デバッグ、および保守することになります。これは時間と労力の無駄です。

とはいえ、自分で実装すると決めた場合は：

1）あなたが以前に尋ねたcudaの質問には、すでに乗算用のCコードがあります。何か問題はありましたか？

2) シフトはおそらく asm を使用してもメリットがないため、ここでも C によるソリューションが理にかなっています。ただし、ここでパフォーマンスが本当に問題になる場合は、Edison が SHLD/SHRD をサポートしているかどうかを確認します。サポートしていれば、少し速くなる可能性があります。そうでなければ、例えばこのようなアプローチはどうでしょうか？

// Portable left shift of a 128-bit value held as four 32-bit words
// (x = least significant word, w = most significant word).
//
// a: value to shift.
// b: shift count in bits. Counts outside 0..127 produce an all-zero result.
//
// Returns a shifted left by b bits; bits shifted past the top of w are lost.
my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
{
   const unsigned int in[4] = { a.x, a.y, a.z, a.w };
   unsigned int out[4] = { 0, 0, 0, 0 };
   my_uint128_t res;

   if (b >= 0 && b < 128) {
      const int words = b / 32;   // whole 32-bit words to move up
      const int bits = b % 32;    // remaining shift inside a word

      for (int i = words; i < 4; i++) {
         out[i] = in[i - words] << bits;
         // Pull in the bits that spill over from the next lower word.
         // Skipped when bits == 0: a right shift by 32 is undefined.
         if (bits != 0 && i > words)
            out[i] |= in[i - words - 1] >> (32 - bits);
      }
   }

   res.x = out[0];
   res.y = out[1];
   res.z = out[2];
   res.w = out[3];

   return res;
}

更新: Edison は SHLD/SHRD をサポートしている可能性があるため、上記の「c」コードよりもパフォーマンスが高い可能性がある別の方法を次に示します。より高速であると主張するすべてのコードと同様に、テストする必要があります。

// Double-word shift left: returns `into` shifted left by c bits, with the
// vacated low bits filled from the top bits of `from`.
//
// Only the low 5 bits of c are used, matching what the SHLD instruction does
// with its count, so c == 0 returns `into` unchanged.
//
// `static` gives the helper internal linkage so a call that is not inlined
// still links when the file is built as C.
static inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
#if defined(__i386__) || defined(__x86_64__)
   // When every argument is a compile-time constant the plain C expression
   // below can be folded away entirely; otherwise emit a single SHLD.
   if (!(__builtin_constant_p(into) &&
         __builtin_constant_p(from) &&
         __builtin_constant_p(c)))
   {
      unsigned int res;

      // __asm__ rather than asm so this also builds in strict ISO C modes.
      __asm__("shld %b3, %2, %0"
              : "=rm" (res)
              : "0" (into), "r" (from), "ic" (c)
              : "cc");

      return res;
   }
#endif

   {
      const unsigned int n = c & 31u;

      // n == 0 is handled separately: from >> 32 would be undefined.
      return n ? ((into << n) | (from >> (32u - n))) : into;
   }
}

// Double-word shift right: returns `into` shifted right by c bits, with the
// vacated high bits filled from the low bits of `from`.
//
// Only the low 5 bits of c are used, matching what the SHRD instruction does
// with its count, so c == 0 returns `into` unchanged.
//
// `static` gives the helper internal linkage so a call that is not inlined
// still links when the file is built as C.
static inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
#if defined(__i386__) || defined(__x86_64__)
   // When every argument is a compile-time constant the plain C expression
   // below can be folded away entirely; otherwise emit a single SHRD.
   if (!(__builtin_constant_p(into) &&
         __builtin_constant_p(from) &&
         __builtin_constant_p(c)))
   {
      unsigned int res;

      // __asm__ rather than asm so this also builds in strict ISO C modes.
      __asm__("shrd %b3, %2, %0"
              : "=rm" (res)
              : "0" (into), "r" (from), "ic" (c)
              : "cc");

      return res;
   }
#endif

   {
      const unsigned int n = c & 31u;

      // n == 0 is handled separately: from << 32 would be undefined.
      return n ? ((into >> n) | (from << (32u - n))) : into;
   }
}

// Left shift of a 128-bit value (x = least significant word) built on __shld.
// Shift counts of 128 or more give an all-zero result.
my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;
   const unsigned int bits = b % 32;   // shift that remains inside a word

   // Words that are not written below stay zero.
   res.x = 0;
   res.y = 0;
   res.z = 0;
   res.w = 0;

   // b / 32 is the number of whole words the value moves up.
   switch (b / 32) {
   case 0:
      res.w = __shld(a.w, a.z, bits);
      res.z = __shld(a.z, a.y, bits);
      res.y = __shld(a.y, a.x, bits);
      res.x = a.x << bits;
      break;
   case 1:
      res.w = __shld(a.z, a.y, bits);
      res.z = __shld(a.y, a.x, bits);
      res.y = a.x << bits;
      break;
   case 2:
      res.w = __shld(a.y, a.x, bits);
      res.z = a.x << bits;
      break;
   case 3:
      res.w = a.x << bits;
      break;
   default:
      // Everything has been shifted out.
      break;
   }

   return res;
}

// Logical right shift of a 128-bit value (x = least significant word) built
// on __shrd. Shift counts of 128 or more give an all-zero result.
my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;
   const unsigned int bits = b % 32;   // shift that remains inside a word

   // Words that are not written below stay zero.
   res.x = 0;
   res.y = 0;
   res.z = 0;
   res.w = 0;

   // b / 32 is the number of whole words the value moves down.
   switch (b / 32) {
   case 0:
      res.x = __shrd(a.x, a.y, bits);
      res.y = __shrd(a.y, a.z, bits);
      res.z = __shrd(a.z, a.w, bits);
      res.w = a.w >> bits;
      break;
   case 1:
      res.x = __shrd(a.y, a.z, bits);
      res.y = __shrd(a.z, a.w, bits);
      res.z = a.w >> bits;
      break;
   case 2:
      res.x = __shrd(a.z, a.w, bits);
      res.y = a.w >> bits;
      break;
   case 3:
      res.x = a.w >> bits;
      break;
   default:
      // Everything has been shifted out.
      break;
   }

   return res;
}

3) 加算は asm の恩恵を受ける可能性があります。これを試すことができます：

// 128-bit unsigned integer stored as four 32-bit words.
// The typedef lets the bare name my_uint128_t be used as a type in C as well
// as in C++.
typedef struct my_uint128_t
{
   unsigned int x;   // least significant word
   unsigned int y;
   unsigned int z;
   unsigned int w;   // most significant word
} my_uint128_t;

// Returns a + b for two 128-bit values (x = least significant word) using an
// x86 add / add-with-carry chain. A carry out of the top word is discarded.
my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
{
   my_uint128_t res;

   // Each output register is tied to the matching word of a (constraints
   // "0".."3"), so the instructions add b's word into it in place.
   // No '%' commutative modifier is used: GCC documents that it can handle
   // only one commutative pair per asm statement.
   // Outputs are early-clobber because the first one is written before the
   // later words of b have been read.
   asm ("addl %[bx], %[resx]\n\t"
        "adcl %[by], %[resy]\n\t"
        "adcl %[bz], %[resz]\n\t"
        "adcl %[bw], %[resw]\n\t"
        : [resx] "=&r" (res.x), [resy] "=&r" (res.y),
          [resz] "=&r" (res.z), [resw] "=&r" (res.w)
        : "0" (a.x), [bx] "irm" (b.x),
          "1" (a.y), [by] "irm" (b.y),
          "2" (a.z), [bz] "irm" (b.z),
          "3" (a.w), [bw] "irm" (b.w)
        : "cc");

   return res;
}

これはざっと書いただけのコードなので、自己責任で使用してください。私は Edison を持っていませんが、x86 では動作します。

更新：累積を行っているだけの場合（上記のコードの c = a + b ではなく to += from を考えてください）、次のコードの方が適している可能性があります：

// Accumulates `from` into `*to` (to += from) with an x86 add / add-with-carry
// chain, lowest word (x) first. A carry out of the top word is discarded.
inline
void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
{
   // Work on local copies of the destination words and store them back
   // once the carry chain has completed.
   unsigned int sum_x = to->x;
   unsigned int sum_y = to->y;
   unsigned int sum_z = to->z;
   unsigned int sum_w = to->w;

   asm ("addl %[fromx], %[tox]\n\t"
        "adcl %[fromy], %[toy]\n\t"
        "adcl %[fromz], %[toz]\n\t"
        "adcl %[fromw], %[tow]\n\t"
        : [tox] "+&r"(sum_x), [toy] "+&r"(sum_y),
          [toz] "+&r"(sum_z), [tow] "+&r"(sum_w)
        : [fromx] "irm"(from.x), [fromy] "irm"(from.y),
          [fromz] "irm"(from.z), [fromw] "irm"(from.w)
        : "cc");

   to->x = sum_x;
   to->y = sum_y;
   to->z = sum_z;
   to->w = sum_w;
}

score 5 · Accepted Answer

外部ライブラリの使用がオプションである場合は、この質問を見てください。TTMath は、高精度演算用の非常にシンプルなヘッダーライブラリです。32 ビットアーキテクチャでは、ttmath::UInt<4> で 4 つの 32 ビットリムを持つ 128 ビット整数型が作成されます。他の代替手段としては、Boost.Multiprecision の (u)int128_t や calccrypto/uint128_t があります。

自分で書かなければならない場合は、SOにはすでに多くのソリューションがあり、ここでそれらを要約します

足し算と引き算は非常に簡単です。ワード（多倍長整数ライブラリではしばしば limb と呼ばれます）を、キャリーを伝播させながら最下位から最上位に向かって加算/減算するだけです。

// 128-bit unsigned integer stored as two 64-bit halves.
typedef struct INT128 {
    uint64_t H;   // upper 64 bits
    uint64_t L;   // lower 64 bits
} my_uint128_t;

// Returns a + b, wrapping modulo 2^128.
inline my_uint128_t add(my_uint128_t a, my_uint128_t b)
{
    // Unsigned addition wraps, so the low sum is smaller than an operand
    // exactly when it carried out of 64 bits.
    const uint64_t low = a.L + b.L;
    const uint64_t carry = (low < a.L);

    my_uint128_t sum;
    sum.L = low;
    sum.H = a.H + b.H + carry;
    return sum;
}

アセンブリ出力はコンパイラエクスプローラで確認できます。

コンパイラーはすでにダブルワード演算用の効率的なコードを生成できますが、リンク先の質問でわかるように、高水準言語で書かれたマルチワード演算をコンパイルするときに「キャリー付き加算」（キャリーフラグ）を使用するほど賢くないものも多くあります。したがって、上記のように 2 つの long long を使用すると、読みやすくなるだけでなく、コンパイラがより効率的なコードを出力しやすくなります。

それでもパフォーマンス要件に合わない場合は、組み込み関数を使用するか、アセンブリで記述する必要があります。bignum に格納されている 128 ビット値を {eax, ebx, ecx, edx} の 128 ビット値に加算するには、次のコードを使用できます。

; Adds the 128-bit value stored at [bignum] to the 128-bit value held in
; eax:ebx:ecx:edx. edx is added first without a carry-in, so it is the
; least significant dword; eax is the most significant.
add edx, [bignum]       ; bits 0..31, produces the first carry
adc ecx, [bignum+4]     ; bits 32..63 plus carry
adc ebx, [bignum+8]     ; bits 64..95 plus carry
adc eax, [bignum+12]    ; bits 96..127 plus carry

同等の組み込み関数は、Clang では次のようになります。

// Four-limb addition z = x + y using Clang's __builtin_addc, starting at
// limb 0 with a carry-in of zero and feeding each carry-out into the next limb.
// NOTE(review): x, y and z are declared here but never pointed at storage;
// each has to reference at least four unsigned ints before this runs.
unsigned *x, *y, *z, carryin=0, carryout;
z[0] = __builtin_addc(x[0], y[0], carryin, &carryout);
carryin = carryout;
z[1] = __builtin_addc(x[1], y[1], carryin, &carryout);
carryin = carryout;
z[2] = __builtin_addc(x[2], y[2], carryin, &carryout);
carryin = carryout;
// After this call carryout holds the carry out of the top limb.
z[3] = __builtin_addc(x[3], y[3], carryin, &carryout);

組み込み関数は、gcc の __builtin_uadd_overflow や、MSVC および ICC の _addcarry_u32 など、お使いのコンパイラでサポートされているものに変更する必要があります。

詳細については、これらをお読みください

ビットシフトの場合は、128 ビット数値でのビット単位のシフト演算の質問で C ソリューションを見つけることができます。これは単純な左シフトですが、再帰呼び出しを展開してパフォーマンスを向上させることができます

// Shifts the 128-bit value a:b:c:d left by k bits in place.
// a holds the most significant 32 bits and d the least significant.
// k must be in the range 0..128 (checked with assert); k == 128 clears
// the value.
void shiftl128 (
    unsigned int& a,
    unsigned int& b,
    unsigned int& c,
    unsigned int& d,
    size_t k)
{
    assert (k <= 128);

    // Move whole 32-bit words first.
    while (k >= 32)
    {
        a=b;
        b=c;
        c=d;
        d=0;
        k -= 32;
    }

    // The remaining count is 0..31. Zero has to be skipped: (32-k) would be
    // a shift by 32, which is undefined for a 32-bit integer.
    if (k != 0)
    {
        a = (a << k) | (b >> (32-k));
        b = (b << k) | (c >> (32-k));
        c = (c << k) | (d >> (32-k));
        d = (d << k);
    }
}

32 ビット未満のシフトを行うアセンブリは、「アセンブリ言語を使用した 128 ビットシフト?」という質問にあります。

; 128-bit left shift, by the count in cl (0..31), of the value held in
; edx:esi:ebx:eax (edx = most significant dword, eax = least significant).
; ecx is reserved for the count: cl is its low byte, so ecx cannot also
; carry one of the limbs.
shld    edx, esi, cl    ; top dword takes in the high bits of esi
shld    esi, ebx, cl
shld    ebx, eax, cl
shl     eax, cl         ; lowest dword: zeros are shifted in

右シフトも同様に実装するか、上記のリンクされた質問からコピーすることができます

乗算と除算ははるかに複雑であり、x86 での 2 つの 128 ビット整数の効率的な乗算/除算 (64 ビットなし)の質問で解決策を参照できます。

// Signed 128-bit integer stored as four 32-bit words.
// dw3 is the most significant word (its top bit is read as the sign below)
// and dw0 the least significant. The members are declared most significant
// first, which is what the byte offsets used in the inline asm rely on.
class int128_t
{
    uint32_t dw3, dw2, dw1, dw0;

    // Various constructors, operators, etc. (not shown in this excerpt)

    // Multiplies *this by rhs in place, keeping the low 128 bits of the product.
    // Both operands are converted to their magnitudes, multiplied as unsigned
    // values with 32-bit x86 inline asm, and the sign is re-applied at the end.
    // Relies on operator+= and a copy constructor that are not shown here.
    int128_t& operator*=(const int128_t&  rhs) __attribute__((always_inline))
    {
        int128_t Urhs(rhs);
        // Shifting the top word right by 31 as a signed value smears the sign
        // bit, so each mask is all ones for a negative operand and zero otherwise.
        // NOTE(review): assumes >> on a negative int32_t is an arithmetic shift.
        uint32_t lhs_xor_mask = (int32_t(dw3) >> 31);
        uint32_t rhs_xor_mask = (int32_t(Urhs.dw3) >> 31);
        // The product is negative exactly when the operand signs differ.
        uint32_t result_xor_mask = (lhs_xor_mask ^ rhs_xor_mask);
        // Conditional two's-complement negation: flip every bit with the mask,
        // then add 1 when the mask is set.
        dw0 ^= lhs_xor_mask;
        dw1 ^= lhs_xor_mask;
        dw2 ^= lhs_xor_mask;
        dw3 ^= lhs_xor_mask;
        Urhs.dw0 ^= rhs_xor_mask;
        Urhs.dw1 ^= rhs_xor_mask;
        Urhs.dw2 ^= rhs_xor_mask;
        Urhs.dw3 ^= rhs_xor_mask;
        *this += (lhs_xor_mask & 1);
        Urhs += (rhs_xor_mask & 1);

        // Packs copies of both magnitudes into one object so the asm can reach
        // them through a single pointer: dqw1 (the rhs copy) first, then dqw0
        // (the copy of *this).
        struct mul128_t
        {
            int128_t dqw1, dqw0;
            mul128_t(const int128_t& dqw1, const int128_t& dqw0): dqw1(dqw1), dqw0(dqw0){}
        };

        mul128_t data(Urhs,*this);
        // Schoolbook multiplication on 32-bit words. ebp is loaded with &data;
        // offsets 0..12 address dqw1 (dw3..dw0) and 16..28 address dqw0
        // (dw3..dw0), assuming int128_t is exactly its four words with no
        // padding -- TODO confirm. The running sum is kept in ebx (lowest
        // word), ecx, esi and edi (highest word). Only word products that
        // contribute to the low 128 bits are computed.
        // NOTE(review): mull writes eax and edx, but edx is not listed as a
        // clobber and eax is declared as an input only; the asm also reads
        // `data` through a pointer without a "memory" clobber. Verify against
        // the GCC extended-asm rules before relying on this.
        asm volatile(
        "push      %%ebp                            \n\
        movl       %%eax,   %%ebp                   \n\
        movl       $0x00,   %%ebx                   \n\
        movl       $0x00,   %%ecx                   \n\
        movl       $0x00,   %%esi                   \n\
        movl       $0x00,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%ebx                   \n\
        adcl       %%edx,   %%ecx                   \n\
        adcl       $0x00,   %%esi                   \n\
        adcl       $0x00,   %%edi                   \n\
        movl   24(%%ebp),   %%eax #Calc: (dw1*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%ecx                   \n\
        adcl       %%edx,   %%esi                   \n\
        adcl       $0x00,   %%edi                   \n\
        movl   20(%%ebp),   %%eax #Calc: (dw2*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%esi                   \n\
        adcl       %%edx,   %%edi                   \n\
        movl   16(%%ebp),   %%eax #Calc: (dw3*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw1)  \n\
        mull              8(%%ebp)                  \n\
        addl       %%eax,   %%ecx                   \n\
        adcl       %%edx,   %%esi                   \n\
        adcl       $0x00,   %%edi                   \n\
        movl   24(%%ebp),   %%eax #Calc: (dw1*dw1)  \n\
        mull              8(%%ebp)                  \n\
        addl       %%eax,   %%esi                   \n\
        adcl       %%edx,   %%edi                   \n\
        movl   20(%%ebp),   %%eax #Calc: (dw2*dw1)  \n\
        mull              8(%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw2)  \n\
        mull              4(%%ebp)                  \n\
        addl       %%eax,   %%esi                   \n\
        adcl       %%edx,   %%edi                   \n\
        movl   24(%%ebp),  %%eax #Calc: (dw1*dw2)   \n\
        mull              4(%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw3)  \n\
        mull               (%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        pop        %%ebp                            \n"
        :"=b"(this->dw0),"=c"(this->dw1),"=S"(this->dw2),"=D"(this->dw3)
        :"a"(&data):"%ebp");

        // Re-apply the sign: negate the unsigned product when exactly one
        // operand was negative.
        dw0 ^= result_xor_mask;
        dw1 ^= result_xor_mask;
        dw2 ^= result_xor_mask;
        dw3 ^= result_xor_mask;
        return (*this += (result_xor_mask & 1));
    }
};

また、 128bitタグで関連する質問を多数見つけることができます。

c++ - C++ で基本的な 128 ビット整数計算を行う効率的な方法は?

2 に答える 2

Related

Reference