c - Cで最速のデインターリーブ操作？

Question

2つの異なる配列とmixedのインターリーブされたバイトを含むバイト配列へのポインタがあります。次のようになります。array1array2mixed

a1b2c3d4...

私がする必要があるのは、バイトをデインターリーブすることarray1 = abcd...ですarray2 = 1234...。私はmixed前もっての長さを知っています、そしてとの長さarray1はarray2同等であり、両方ともに等しいmixed / 2です。

これが私の現在の実装です（array1そしてarray2すでに割り当てられています）：

int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
    array1[i] = mixed[j];
    array2[i] = mixed[j+1];
}

これにより、コストのかかる乗算または除算演算を回避できますが、それでも十分な速度で実行されません。memcpy低レベルのブロックコピー操作を使用してプロセスを高速化できるインデクサーが必要なようなものがあることを期待しています。私が現在持っているものよりも速い実装はありますか？

編集

ターゲットプラットフォームは、iOSおよびMac用のObjective-Cです。iOSデバイスでは高速操作がより重要であるため、iOSをターゲットとするソリューションは何もないよりも優れています。

アップデート

回答してくれたすべての人、特にStephen Canon、Graham Lee、Meckiに感謝します。これが私の「マスター」関数で、可能な場合はStephenのNEON組み込み関数を使用し、そうでない場合はMeckiが提案するように反復回数を減らしたGrahamのユニオンカーソルを使用します。

void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t dstABLength_32 = div(dstABLength, 32);
    if (dstABLength_32.rem == 0)
    {
        while (dstABLength_32.quot --> 0)
        {
            const uint8x16_t a = vld1q_u8(srcA);
            const uint8x16_t b = vld1q_u8(srcB);
            const uint8x16x2_t ab = { a, b };
            vst2q_u8(dstAB, ab);
            srcA += 16;
            srcB += 16;
            dstAB += 32;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t dstABLength_16 = div(dstABLength, 16);
    if (dstABLength_16.rem == 0)
    {
        while (dstABLength_16.quot --> 0)
        {
            const uint8x8_t a = vld1_u8(srcA);
            const uint8x8_t b = vld1_u8(srcB);
            const uint8x8x2_t ab = { a, b };
            vst2_u8(dstAB, ab);
            srcA += 8;
            srcB += 8;
            dstAB += 16;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t dstABLength_8 = div(dstABLength, 8);
    if (dstABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *dstAB64 = (uint64_t *)dstAB;
        int j = 0;
        for (int i = 0; i < dstABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            cursor.narrow.a3 = srcA[j  ];
            cursor.narrow.b3 = srcB[j++];
            cursor.narrow.a4 = srcA[j  ];
            cursor.narrow.b4 = srcB[j++];
            dstAB64[i] = cursor.wide;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t dstABLength_4 = div(dstABLength, 4);
    if (dstABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *dstAB32 = (uint32_t *)dstAB;
        int j = 0;
        for (int i = 0; i < dstABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            dstAB32[i] = cursor.wide;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t dstABLength_2 = div(dstABLength, 2);
    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *dstAB16 = (uint16_t *)dstAB;
    for (int i = 0; i < dstABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.narrow.a = srcA[i];
        cursor.narrow.b = srcB[i];
        dstAB16[i] = cursor.wide;
    }
}

void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t srcABLength_32 = div(srcABLength, 32);
    if (srcABLength_32.rem == 0)
    {
        while (srcABLength_32.quot --> 0)
        {
            const uint8x16x2_t ab = vld2q_u8(srcAB);
            vst1q_u8(dstA, ab.val[0]);
            vst1q_u8(dstB, ab.val[1]);
            srcAB += 32;
            dstA += 16;
            dstB += 16;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t srcABLength_16 = div(srcABLength, 16);
    if (srcABLength_16.rem == 0)
    {
        while (srcABLength_16.quot --> 0)
        {
            const uint8x8x2_t ab = vld2_u8(srcAB);
            vst1_u8(dstA, ab.val[0]);
            vst1_u8(dstB, ab.val[1]);
            srcAB += 16;
            dstA += 8;
            dstB += 8;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t srcABLength_8 = div(srcABLength, 8);
    if (srcABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *srcAB64 = (uint64_t *)srcAB;
        int j = 0;
        for (int i = 0; i < srcABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.wide = srcAB64[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
            dstA[j  ] = cursor.narrow.a3;
            dstB[j++] = cursor.narrow.b3;
            dstA[j  ] = cursor.narrow.a4;
            dstB[j++] = cursor.narrow.b4;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t srcABLength_4 = div(srcABLength, 4);
    if (srcABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *srcAB32 = (uint32_t *)srcAB;
        int j = 0;
        for (int i = 0; i < srcABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.wide = srcAB32[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t srcABLength_2 = div(srcABLength, 2);
    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *srcAB16 = (uint16_t *)srcAB;
    for (int i = 0; i < srcABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.wide = srcAB16[i];
        dstA[i] = cursor.narrow.a;
        dstB[i] = cursor.narrow.b;
    }
}

score 10 · Accepted Answer

頭のてっぺんから、2 チャネルのバイトデータのインターリーブを解除するためのライブラリ関数を知りません。ただし、そのような機能を要求するには、Apple にバグレポートを提出する価値があります。

それまでの間、NEON または SSE 組み込み関数を使用して、このような関数をベクトル化するのは非常に簡単です。具体的には、ARM では、vld1q_u8各ソース配列からベクターをロードし、vuzpq_u8インターリーブを解除し、vst1q_u8結果のベクターを格納するために使用する必要があります。これは、私がテストしたり、作成しようとさえしていない大まかなスケッチですが、一般的なアイデアを説明する必要があります。より洗練された実装が確実に可能です (特に、NEON は1 つの命令で2 つの16B レジスタをロード/ストアできますが、コンパイラはこれを実行できない可能性があり、バッファの長さによっては、ある程度のパイプライン化および/または展開が有益な場合があります)。それは）：

#if defined __ARM_NEON__
#   include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>

void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength) {
#if defined __ARM_NEON__
    size_t vectors = mixedLength / 32;
    mixedLength %= 32;
    while (vectors --> 0) {
        const uint8x16_t src0 = vld1q_u8(mixed);
        const uint8x16_t src1 = vld1q_u8(mixed + 16);
        const uint8x16x2_t dst = vuzpq_u8(src0, src1);
        vst1q_u8(array1, dst.val[0]);
        vst1q_u8(array2, dst.val[1]);
        mixed += 32;
        array1 += 16;
        array2 += 16;
    }
#endif
    for (size_t i=0; i<mixedLength/2; ++i) {
        array1[i] = mixed[2*i];
        array2[i] = mixed[2*i + 1];
    }
}

score 3 · Accepted Answer

私はこれを軽くテストしただけですが、あなたのバージョンの少なくとも2倍の速さのようでした：

typedef union {
uint16_t wide;
struct { uint8_t top; uint8_t bottom; } narrow;
} my_union;

uint16_t *source = (uint16_t *)mixed;
for (int i = 0; i < mixedLength/2; i++)
{
    my_union cursor;
    cursor.wide = source[i];
    array1[i] = cursor.narrow.top;
    array2[i] = cursor.narrow.bottom;
}

構造体のパッキングに注意していなかったことに注意してください。ただし、この場合、このアーキテクチャでは問題ありません。topまた、誰かが私の名前の選択に不満を言うかもしれないことに注意してくださいbottom。私はあなたがあなたが必要とする整数の半分を知っていると思います。

score 2 · Accepted Answer

さて、これがあなたの元の方法です：

static void simpleDeint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    int i, j;
    int mixedLength_2 = mixedLength / 2;
    for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
    {
        array1[i] = mixed[j];
        array2[i] = mixed[j+1];
    }
}

1000万のエントリがあり-O3（コンパイラは最大速度に最適化されます）、Macでこれを1秒間に154回実行できます。

これが私の最初の提案です：

static void structDeint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    int i;
    int len;
    uint8_t * array1Ptr = (uint8_t *)array1;
    uint8_t * array2Ptr = (uint8_t *)array2;
    struct {
        uint8_t byte1;
        uint8_t byte2;
    } * tb = (void *)mixed;

    len = mixedLength / 2;
    for (i = 0; i < len; i++) {
      *(array1Ptr++) = tb->byte1;
      *(array2Ptr++) = tb->byte2;
      tb++;
    }
}

以前と同じカウントと最適化で、1秒あたり193回の実行が得られます。

さて、グラハム・リーからの提案：

static void unionDeint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    union my_union {
        uint16_t wide;
        struct { uint8_t top; uint8_t bottom; } narrow;
    };

    uint16_t * source = (uint16_t *)mixed;
    for (int i = 0; i < mixedLength/2; i++) {
        union my_union cursor;
        cursor.wide = source[i];
        array1[i] = cursor.narrow.top;
        array2[i] = cursor.narrow.bottom;
    }
}

以前と同じセットアップで、毎秒198回実行されます（注：このメソッドはエンディアンセーフではなく、結果はCPUエンディアンによって異なります。この場合、ARMはリトルエンディアンであるため、array1とarray2はおそらくスワップされるため、コードでスワップする必要があります。）。

これが私のこれまでの最高のものです：

static void uint32Deint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    int i;
    int count;
    uint32_t * fourBytes = (void *)mixed;
    uint8_t * array1Ptr = (uint8_t *)array1;
    uint8_t * array2Ptr = (uint8_t *)array2;


    count = mixedLength / 4;
    for (i = 0; i < count; i++) {
        uint32_t temp = *(fourBytes++);

#if __LITTLE_ENDIAN__
        *(array1Ptr++) = (uint8_t)(temp & 0xFF);
        temp >>= 8;
        *(array2Ptr++) = (uint8_t)(temp & 0xFF);
        temp >>= 8;
        *(array1Ptr++) = (uint8_t)(temp & 0xFF);
        temp >>= 8;
        *(array2Ptr++) = tb->byte2;

#else
        *(array1Ptr++) = (uint8_t)(temp >> 24);
        *(array2Ptr++) = (uint8_t)((temp >> 16) & 0xFF);
        *(array1Ptr++) = (uint8_t)((temp >>  8) & 0xFF);
        *(array2Ptr++) = (uint8_t)(temp & 0xFF);
#endif
    }
    // Either it is a multiple of 4 or a multiple of 2.
    // If it is a multiple of 2, 2 bytes are left over.
    if (count * 4 != mixedLength) {
        *(array1Ptr) = mixed[mixedLength - 2];
        *(array2Ptr) = mixed[mixedLength - 1];
    }
}

上記と同じセットアップで、1秒間に219回、間違えない限り、どちらのエンディアンでも機能するはずです。

score 1 · Accepted Answer

グラハムのソリューションをお勧めしますが、これが本当に速度が重要であり、アセンブラーを使用する意思がある場合は、さらに高速になります。

アイデアはこれです：

から32ビット整数全体を読み取りmixedます。'a1b2'を取得します。
下位16ビットを8ビットローテーションして「1ab2」を取得します（これはARMのデフォルトであり、したがってApple A＃であるため、最初の2バイトが下位バイトであるためリトルエンディアンを使用しています）。
32ビットレジスタ全体を8ビット右に回転させて（私はそれが正しいと思います...）、「21ab」を取得します。
下位16ビットを8ビット回転して、「12ab」を取得します
下位8ビットをに書き込みますarray2。
32ビットレジスタ全体を16ビットローテーションします。
下位8ビットをに書き込みますarray1
array116ビット、array216ビット、および32ビットずつ進みmixedます。
繰り返す。

2つのメモリ読み取り（Grahamのバージョンまたは同等のものを使用すると仮定）と4つのメモリを1つのメモリ読み取り、2つのメモリ書き込み、および4つのレジスタ操作と交換しました。操作の数は6から7に増えましたが、レジスタ操作はメモリ操作よりも高速であるため、その方が効率的です。また、16ビットではなく32ビットから読み取るため、mixed反復管理を半分に削減しました。

PS：理論的には、これは64ビットアーキテクチャでも実行できますが、「a1b2c3d4」でこれらすべてのローテーションを実行すると、狂気に陥ります。

score 1 · Accepted Answer

x86 SSE の場合は、packとのpunpck手順が必要です。非破壊的な 3 オペランド命令の利便性のために AVX を使用する例。(AVX2 256b 幅の命令は使用しません。256b の pack/unpck 命令は 128b の低レーンと高レーンで 2 つの 128b のアンパックを実行するため、正しい最終順序で物事を取得するにはシャッフルが必要になります。)

次の組み込みバージョンは同じように機能します。Asm の指示は、簡単な回答を書くためだけに入力するのに短くなります。

インターリーブ:abcdおよび1234-> a1b2c3d4:

# loop body:
vmovdqu    (%rax), %xmm0  # load the sources
vmovdqu    (%rbx), %xmm1
vpunpcklbw %xmm0, %xmm1, %xmm2  # low  halves -> 128b reg
vpunpckhbw %xmm0, %xmm2, %xmm3  # high halves -> 128b reg
vmovdqu    %xmm2, (%rdi)   # store the results
vmovdqu    %xmm3, 16(%rdi)
# blah blah some loop structure.

`punpcklbw` interleaves the bytes in the low 64 of the two source `xmm` registers.  There are `..wd` (word->dword), and dword->qword versions which would be useful for 16 or 32bit elements.

デインターリーブ: a1b2c3d4->abcdおよび1234

#outside the loop
vpcmpeqb    %xmm5, %xmm5   # set to all-1s
vpsrlw     $8, %xmm5, %xmm5   # every 16b word has low 8b = 0xFF, high 8b = 0.

# loop body
vmovdqu    (%rsi), %xmm2     # load two src chunks
vmovdqu    16(%rsi), %xmm3
vpand      %xmm2, %xmm5, %xmm0  # mask to leave only the odd bytes
vpand      %xmm3, %xmm5, %xmm1
vpackuswb  %xmm0, %xmm1, %xmm4
vmovdqu    %xmm4, (%rax)    # store 16B of a[]
vpsrlw     $8, %xmm2, %xmm6     # even bytes -> odd bytes
vpsrlw     $8, %xmm3, %xmm7
vpackuswb  %xmm6, %xmm7, %xmm4
vmovdqu    %xmm4, (%rbx)

もちろん、これははるかに少ないレジスタを使用できます。パフォーマンスではなく、読みやすさのためにレジスタの再利用を避けました。ハードウェアレジスタの名前変更により、以前の値に依存しないものから始める限り、再利用は問題になりません。(例:movdではないmovss、またはpinsrd.)

packデインターリーブは、命令が符号付きまたは符号なしの飽和を行うため、非常に多くの作業が必要です。そのため、各 16b 要素の上位 8b を最初にゼロにする必要があります。

pshufb別の方法として、1 つのソースレジスタの奇数ワードまたは偶数ワードをレジスタの下位 64 にパックする方法があります。ただし、AMD XOP 命令セットの以外では、VPPERM2 つのレジスタからバイトを一度に選択できるシャッフルはありません (Altivec の大人気ののようにvperm)。したがって、SSE/AVX だけでは、128b のインターリーブデータごとに 2 つのシャッフルが必要になります。また、ストアポートの使用がボトルネックになる可能性があるため、punpck2 つの 64 ビットチャンクをa1 つのレジスタに結合して 128 ビットストアをセットアップします。

AMD XOP では、デインターリーブは 2x128b ロード、2VPPERMおよび 2x128b ストアになります。

score -1 · Accepted Answer

時期尚早の最適化は悪い
あなたのコンパイラはおそらくあなたよりも最適化に優れています。

とは言うものの、コンパイラーが持つことのできないデータのセマンティック知識を持っているため、コンパイラーを支援するためにできることがあります。

ネイティブワードサイズまで、できるだけ多くのバイトを読み書きします-メモリ操作はコストがかかるため、可能な場合はレジスタの操作を行います
ループを展開します-「Duff'sDevice」を調べます。

FWIW、私はあなたのコピーループの2つのバージョンを作成しました。1つはあなたのものとほとんど同じで、2つ目はほとんどが「最適な」（まだ単純ですが）Cコードと見なすものを使用しています。

void test1(byte *p, byte *p1, byte *p2, int n)
{
    int i, j;
    for (i = 0, j = 0; i < n / 2; i++, j += 2) {
        p1[i] = p[j];
        p2[i] = p[j + 1];
    }
}

void test2(byte *p, byte *p1, byte *p2, int n)
{
    while (n) {
        *p1++ = *p++;
        *p2++ = *p++;
        n--; n--;
    }
}

gcc -O3 -SIntel x86では、どちらもほぼ同じアセンブリコードを生成しました。内側のループは次のとおりです。

LBB1_2:
    movb    -1(%rdi), %al
    movb    %al, (%rsi)
    movb    (%rdi), %al
    movb    %al, (%rdx)
    incq    %rsi
    addq    $2, %rdi
    incq    %rdx
    decq    %rcx
    jne LBB1_2

と

LBB2_2:
    movb    -1(%rdi), %al
    movb    %al, (%rsi)
    movb    (%rdi), %al
    movb    %al, (%rdx)
    incq    %rsi
    addq    $2, %rdi
    incq    %rdx
    addl    $-2, %ecx
    jne LBB2_2

n / 2どちらも同じ数の命令を持っており、最初のバージョンがまでカウントアップし、2番目のバージョンがゼロまでカウントダウンするという理由だけで違いが説明されます。

ここで編集するより良いバージョンがあります：

/* non-portable - assumes little endian */
void test3(byte *p, byte *p1, byte *p2, int n)
{
    ushort *ps = (ushort *)p;

    n /= 2;
    while (n) {
        ushort n = *ps++;
        *p1++ = n;
        *p2++ = n >> 8;
    }
}

その結果：

LBB3_2:
    movzwl  (%rdi), %ecx
    movb    %cl, (%rsi)
    movb    %ch, (%rdx)  # NOREX
    addq    $2, %rdi
    incq    %rsi
    incq    %rdx
    decq    %rax
    jne LBB3_2

%clとへの即時アクセスを利用するため、これは1つ少ない命令です%ch。

c - Cで最速のデインターリーブ操作？

6 に答える 6

Related

Reference