math - GPUで定数による除算の魔法の乗数を見つける方法は?

Question

次の計算の実装を検討していました。ここで、divisor はゼロではなく、2 のべき乗でもありません。

// Return floor(2^(32 + shift) / divisor), where shift is the index of the
// highest set bit of divisor.
//
// Preconditions: divisor is nonzero (the count-leading-zeros builtin is
// undefined for 0 and the division would trap) and is not a power of two
// (for a power of two the quotient is exactly 2^32 and does not fit in
// 32 bits).
unsigned multiplier(unsigned divisor)
{
    // Index of the most significant set bit of divisor
    unsigned shift = 31 - __builtin_clz(divisor);
    // Dividend 2^(32 + shift); needs 64 bits
    uint64_t t = 1ull << (32 + shift);

    // For a divisor that is not a power of two, 2^shift < divisor, so the
    // quotient is below 2^32 and the narrowing conversion loses nothing
    return (unsigned)(t / divisor);
}

この計算を、64 ビットの整数命令および浮動小数点命令を持たないものの、32 ビットの融合積和演算 (FMA) は備えている可能性があるプロセッサ (同じく除算命令を持たない GPU など) 向けに、効率よく実装したいと考えています。

この計算は、除数が事前にわかっている場合に、除算を上位乗算 (multiply-high) 命令とそれに続くビットシフトへ置き換える最適化で用いる「魔法の乗数」を求めるのに役立ちます。コンパイラで使用されるコードや libdivide の参照コードとは異なり、この計算はそのような乗数のうち最大のものを求めます。

もう 1 つのひねりは、私が見ていたアプリケーションでは、divisor はほとんどの場合 float 型で正確に表現できると予想していたことです。したがって、そのような除数を処理する効率的な「高速パス」と、残りを処理するコードサイズ優先の「低速パス」を用意することは理にかなっています。

score 0 · Accepted Answer

私が思いついた解決策は、この特定のシナリオ (被除数が 2 のべき乗) に特化した剰余付きの長除法 (筆算) を行うものです。「高速パス」では 6 回または 8 回の FMA 演算を使い、「低速パス」ではその後に 8 回の反復による二分探索を行います。

次のプログラムは、提案した解法を網羅的にテストします (FMA 対応の CPU で 1 ～ 2 分ほどかかります)。

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Result pair of an integer division: quo holds the quotient and mod the
// remainder. Field order matters because the functions in this file fill
// the struct positionally.
struct quomod {
    unsigned long quo, mod;
};

// Divide 1 << (32 + SHIFT) by DIV, return quotient and modulus
struct quomod
quomod_ref(unsigned div, unsigned shift)
{
    uint64_t t = 1ull << (32 + shift);

    return (struct quomod){t / div, t % div};
}

// Return the float whose object representation is the given 32-bit pattern
// (type punning through a union; no numeric conversion takes place)
static inline float int_as_float(uint32_t bits)
{
    union {
        unsigned b;
        float f;
    } pun;

    pun.b = bits;
    return pun.f;
}

// Convert F, which holds an integral value within [-2**32 .. 2**32], to a
// 32-bit unsigned integer, wrapping around modulo 2**32 when the value does
// not fit. A GPU with a saturating float-to-integer conversion may use that
// instead.
static inline uint32_t cvt_f32_u32_wrap(float f)
{
    // A 64-bit signed intermediate can hold the whole input range, so the
    // float-to-integer step never overflows
    long long wide = (long long)f;

    // Narrowing to an unsigned type reduces the value modulo 2**32
    return (uint32_t)wide;
}

// Compute the quotient and remainder of 2^(32 + SHIFT) divided by DIV,
// doing the bulk of the work in single-precision float (multiplies and
// fmaf) instead of a 64-bit integer division.
//
// main() in this file calls it with DIV >= 3, DIV not a power of two and
// SHIFT = 31 - clz(DIV), and compares the result against quomod_ref.
//
// Structure:
//  - Fast path: DIV, with its low max(0, SHIFT - 23) bits cleared, is
//    converted to float and the dividend is long-divided by it in two
//    steps (high part, then a corrected low part).
//  - Slow path: entered only when clearing those bits actually changed
//    DIV; the quotient obtained for the truncated divisor is then walked
//    down to the true one by a binary search using multiply-high.
struct quomod
quomod_alt(unsigned div, unsigned shift)
{
    // t = float(1ull << (32 + shift))
    // 0x4f800000 is the bit pattern of 2^32 as a float; adding
    // shift << 23 raises the exponent field by shift
    float t = int_as_float(0x4f800000 + (shift << 23));

    // mask with max(0, shift - 23) low bits zero
    // (relies on the unsigned -> int conversion and the right shift of a
    // negative int behaving as two's-complement / arithmetic shift, both
    // of which are implementation-defined in C)
    uint32_t mask = (int)(~0u << shift) >> 23;

    // No roundoff in conversion: when shift is the index of div's top set
    // bit, the masked value has at most 24 significant bits
    float div_f = div & mask;

    // Caution: on the CPU this is correctly rounded, but on the GPU
    // native reciprocal may be off by a few ULP, in which case a
    // refinement step may be necessary:
    // recip = fmaf(fmaf(recip, -div_f, 1), recip, recip)
    float recip = 1.f / div_f;

    // Higher part of the quotient, integer in range 2^31 .. 2^32
    float quo_hi = t * recip;

    // No roundoff
    // res = t - quo_hi * div_f: what is left of the dividend after the
    // high part of the quotient has been accounted for
    float res = fmaf(quo_hi, -div_f, t);

    // First estimate of the low part of the quotient
    float quo_lo_approx = res * recip;

    // res2 = res - quo_lo_approx * div_f: error of that first estimate
    float res2 = fmaf(quo_lo_approx, -div_f, res);

    // Lower part of the quotient, may be negative
    // (the estimate plus a correction term res2 * recip, rounded down)
    float quo_lo = floorf(fmaf(res2, recip, quo_lo_approx));

    // Remaining part of the dividend
    float mod_f = fmaf(quo_lo, -div_f, res);

    // Quotient as sum of parts
    // (quo_hi may be 2^32, which wraps to 0 in the conversion)
    unsigned quo = cvt_f32_u32_wrap(quo_hi) + (int)quo_lo;

    // Adjust quotient down if remainder is negative
    if (mod_f < 0) {
        quo--;
    }

    // Slow path: some low bits of div were dropped by the mask
    if (div & ~mask) {
        // The quotient was computed for a truncated divisor, so
        // it matches or exceeds the true result

        // High part of the dividend
        // (the dividend is 2^(32 + shift), so its upper 32 bits are
        // 1 << shift and its lower 32 bits are zero)
        uint32_t ref_hi = 1u << shift;

        // Unless quotient is zero after wraparound, increment it so
        // it's higher than true quotient (its high bit must be 1)
        // ((int)quo >> 31 is -1 when the high bit is set, 0 otherwise)
        quo -= (int)quo >> 31;

        // Binary search for the true quotient; search invariant:
        // quo is higher than true quotient, quo-2*bit is lower
        // bit steps through 256, 128, ..., 1
        // NOTE: `try` is an ordinary identifier in C but a keyword in C++
        for (unsigned bit = 256; bit; bit >>= 1) {
            unsigned try = quo - bit;
            // One multiply-high instruction
            // (upper 32 bits of the 64-bit product try * div)
            uint32_t prod_hi = 1ull * try * div >> 32;
            // try * div has reached the dividend, so try is still too
            // large: keep it as the new upper bound
            if (prod_hi >= ref_hi)
                quo = try;
        }
        // quo is zero or exceeds the true quotient, so quo-1 must be it
        quo--;
    }

    // Use the "left-pointing short magic wand" operator
    // to recover the remainder
    // i.e. quo * -div in 32-bit unsigned arithmetic: the dividend is a
    // multiple of 2^32, so dividend - quo * div equals -(quo * div)
    // modulo 2^32, and the remainder is below div so nothing is lost
    return (struct quomod){quo, quo *- div};
}

int main()
{
    fprintf(stderr, "%66c\r[", ']');
    unsigned step = 1;
    for (unsigned div = 3; div; div += step) {
        // Progress bar
        if (!(div & 0x03ffffff)) fprintf(stderr, "=");
        // Skip powers of two
        if (!(div & (div-1))) continue;
        unsigned shift = 31 - __builtin_clz(div);

        struct quomod ref = quomod_ref(div, shift);
        struct quomod alt = quomod_alt(div, shift);

        if (ref.quo != alt.quo || ref.mod != alt.mod) {
            printf("\nerror at %u\n", div);
            return 1;
        }
    }
    fprintf(stderr, "=\nAll ok\n");
    return 0;
}

math - GPUで定数による除算の魔法の乗数を見つける方法は?

回答 1 件

Related

Reference