caching - 自動ベクトル化と sse を使用したデータサイズ依存の高速化

Question

Intel Compiler の自動ベクトル化と SSE を使用して、あるコードを高速化しようとしています。計算の内容は、構造体 node_t を別の構造体 w_t に変換することだけです (関数 tr() および gen_tr() を参照)。しかし、関数 gen_tr() をベクトル化しようとしても、何の効果もありません。

データの格納形式を変更し、構造体の各メンバーをそれぞれ別の float 配列に格納するようにすると、自動ベクトル化はうまく機能します。関数 genv_tr() を参照してください。

SSE 組み込み関数を直接使用するバージョンが関数 ssev_tr() です (N は 4 で割り切れる必要があります)。

transform.c:

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>

/*
 * Read the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX; the
 * two halves are combined into one value and returned as unsigned long.
 * On ABIs where unsigned long is only 32 bits wide the result is truncated
 * to the low half.
 */
static __inline__ unsigned long getCC(void)
{
    unsigned lo, hi;

    /* __asm__/__volatile__ spellings are accepted in strict ISO modes
     * (-std=c99/-std=c11), where the plain `asm` keyword is not. */
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

    /* Widen before shifting: shifting a 32-bit unsigned long by 32 would
     * be undefined behaviour. */
    return (unsigned long)((((unsigned long long)hi) << 32) | lo);
}

/* Input record of the transform: five consecutive float members. */
typedef struct {
    float x1;
    float x2;
    float x3;
    float x4;
    float x5;
} node_t;

/* Output record of the transform: four consecutive float members. */
typedef struct {
    float w1;
    float w2;
    float w3;
    float w4;
} w_t;

/*
 * Transform one node_t into one w_t.
 *
 *   w1 = x1 - x3 * c1      w2 = x1 + x3 * c1
 *   w3 = x2 - x4 * c2      w4 = x2 + x4 * c2
 *
 * n  : input record (x5 is not read)
 * c1 : scale factor applied to x3
 * c2 : scale factor applied to x4
 * w  : output record, all four members are written
 */
void tr(node_t *n, float c1, float c2, w_t *w)
{
    /* Every input is loaded before the first store to *w. */
    const float base_a = n->x1;
    const float offset_a = n->x3 * c1;
    const float base_b = n->x2;
    const float offset_b = n->x4 * c2;

    w->w1 = base_a - offset_a;
    w->w2 = base_a + offset_a;
    w->w3 = base_b - offset_b;
    w->w4 = base_b + offset_b;
}

/*
 * Apply tr() to each of the N elements of an array of structures.
 *
 * n      : input array of N node_t records
 * w      : output array of N w_t records
 * N      : element count; nothing is done when N <= 0
 * c1, c2 : scale factors forwarded unchanged to tr()
 *
 * Kept out of line (noinline) so the benchmark times a real call.
 */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int i;
    /* "#pragma vector aligned" is deliberately not used here: node_t holds
     * five floats (20 bytes with 4-byte float), so consecutive elements
     * cannot all sit on 16-byte boundaries and the alignment promise
     * would be false. */
    #pragma ivdep
    for (i = 0; i < N; i++) {
        tr(n + i, c1, c2, w + i);
    }
}

/*
 * Structure-of-arrays version of the transform, written as a plain loop
 * for the compiler to vectorise. For every index i in [0, N):
 *
 *   w1[i] = x1[i] - x3[i] * c1      w2[i] = x1[i] + x3[i] * c1
 *   w3[i] = x2[i] - x4[i] * c2      w4[i] = x2[i] + x4[i] * c2
 *
 * x5 is accepted but never read. The pragmas tell the Intel compiler that
 * the arrays are aligned and that the loop carries no dependences.
 */
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    #pragma vector aligned
    #pragma ivdep
    for (int idx = 0; idx < N; ++idx) {
        const float offset_a = x3[idx] * c1;
        const float offset_b = x4[idx] * c2;

        /* First output pair: x1 minus/plus the scaled x3. */
        w1[idx] = x1[idx] - offset_a;
        w2[idx] = x1[idx] + offset_a;

        /* Second output pair: x2 minus/plus the scaled x4. */
        w3[idx] = x2[idx] - offset_b;
        w4[idx] = x2[idx] + offset_b;
    }
}

/*
 * Structure-of-arrays version of the transform written with SSE
 * intrinsics. For every index i in [0, N):
 *
 *   w1[i] = x1[i] - x3[i] * c1      w2[i] = x1[i] + x3[i] * c1
 *   w3[i] = x2[i] - x4[i] * c2      w4[i] = x2[i] + x4[i] * c2
 *
 * Elements are processed four at a time with aligned loads and stores, so
 * x1..x4 and w1..w4 must be 16-byte aligned. The last N % 4 elements are
 * handled by a scalar tail loop, so N does not have to be a multiple of 4.
 * Nothing is done when N <= 0. x5 is accepted but never read. Output
 * arrays must not overlap the input arrays.
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);

    /* Largest multiple of 4 not exceeding N (never positive when N <= 0). */
    const int vec_end = (N / 4) * 4;
    int i;

    (void)x5;

    #pragma vector aligned
    #pragma ivdep
    for (i = 0; i < vec_end; i += 4) {
        const __m128 a = _mm_load_ps(x1 + i);
        const __m128 b = _mm_load_ps(x2 + i);
        const __m128 N00T = _mm_mul_ps(_mm_load_ps(x3 + i), cs1);
        const __m128 N01T = _mm_mul_ps(_mm_load_ps(x4 + i), cs2);

        _mm_store_ps(w1 + i, _mm_sub_ps(a, N00T));
        _mm_store_ps(w2 + i, _mm_add_ps(a, N00T));
        _mm_store_ps(w3 + i, _mm_sub_ps(b, N01T));
        _mm_store_ps(w4 + i, _mm_add_ps(b, N01T));
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    for (; i < N; i++) {
        const float N00T = x3[i] * c1;
        const float N01T = x4[i] * c2;

        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}

/*
 * Benchmark an array-of-structures kernel with the signature
 * func(node_t *, w_t *, int, float, float).
 *
 * Uses these variables from the enclosing scope: i, n, rep, x, w, c1, c2,
 * t1, t2. Fills x[0..n-1] with fixed values, calls func rep times between
 * two getCC() reads, and prints the counter difference divided by n and
 * by rep, preceded by a tab.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * invoke it with a trailing semicolon.
 */
#define test(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x[i].x1 = 1.0f; \
            x[i].x2 = 2.0f; \
            x[i].x3 = 2.0f; \
            x[i].x4 = 2.0f; \
            x[i].x5 = 2.0f; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x, w, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1)) / n / rep); \
    } while (0)

/*
 * Benchmark a structure-of-arrays kernel with the signature
 * func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2).
 *
 * Uses these variables from the enclosing scope: i, n, rep, x1..x5,
 * w1..w4, c1, c2, t1, t2. Fills the five input arrays with fixed values,
 * calls func rep times between two getCC() reads, and prints the counter
 * difference divided by n and by rep, preceded by a tab.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * invoke it with a trailing semicolon.
 */
#define test1(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x1[i] = 1.0f; \
            x2[i] = 2.0f; \
            x3[i] = 2.0f; \
            x4[i] = 2.0f; \
            x5[i] = 2.0f; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1)) / n / rep); \
    } while (0)

/*
 * Benchmark driver.
 *
 * Usage: transform vector_size
 *
 * Prints vector_size followed by one tab-separated timing (time-stamp
 * counter ticks per element per repetition) for each of gen_tr, genv_tr
 * and ssev_tr, then a newline. The repetition count is chosen so that
 * about 100000000 elements are processed per kernel.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on a missing or invalid
 * argument or when an allocation fails.
 */
int main(int argc, char *argv[])
{
    /* Total number of elements each kernel should process. */
    const long total_elems = 100000000L;
    int status = EXIT_FAILURE;

    if (argc < 2) {
        fprintf(stderr, "Usage %s vector_size\n", argc > 0 ? argv[0] : "transform");
        return EXIT_FAILURE;
    }

    /* Reject non-numeric input, and sizes that would make the repetition
     * count zero or cause a division by zero. */
    char *end = NULL;
    const long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed < 1 || parsed > total_elems) {
        fprintf(stderr, "vector_size must be an integer between 1 and %ld\n", total_elems);
        return EXIT_FAILURE;
    }

    int n = (int)parsed;
    printf("%d", n);
    int rep = (int)(total_elems / n);
    int i;
    float c1 = 2.0f, c2 = 1.0f;
    unsigned long t1, t2;
    const size_t count = (size_t)n;

    /* NOTE(review): genv_tr and ssev_tr assume 16-byte aligned arrays.
     * malloc is presumably 16-byte aligned on the x86_64 glibc target
     * named in the text, but that is not guaranteed everywhere - confirm
     * before porting. */
    node_t *x = malloc(count * sizeof *x);
    w_t *w = malloc(count * sizeof *w);

    float *x1 = malloc(count * sizeof *x1);
    float *x2 = malloc(count * sizeof *x2);
    float *x3 = malloc(count * sizeof *x3);
    float *x4 = malloc(count * sizeof *x4);
    float *x5 = malloc(count * sizeof *x5);

    float *w1 = malloc(count * sizeof *w1);
    float *w2 = malloc(count * sizeof *w2);
    float *w3 = malloc(count * sizeof *w3);
    float *w4 = malloc(count * sizeof *w4);

    if (!x || !w || !x1 || !x2 || !x3 || !x4 || !x5 || !w1 || !w2 || !w3 || !w4) {
        fprintf(stderr, "\nout of memory\n");
        goto cleanup;
    }

    test(gen_tr);
    test1(genv_tr);
    test1(ssev_tr);

    printf("\n");
    status = EXIT_SUCCESS;

cleanup:
    /* free(NULL) is a no-op, so partially failed allocation is fine. */
    free(w4);
    free(w3);
    free(w2);
    free(w1);
    free(x5);
    free(x4);
    free(x3);
    free(x2);
    free(x1);
    free(w);
    free(x);
    return status;
}

コンパイルオプション: icc -O3 -Wall -W -vec-report6 transform.c -o transform

icc のバージョン - 12.1.2、OS - Fedora 16 x86_64、CPU - Intel Core2 Quad CPU Q8200。

次に、サイズを 16 から 3000 まで 64 刻みで変えながら実行します。スクリプトは次のとおりです。

#!/bin/bash

# Start run.log over with a single empty line.
printf '\n' > run.log

# Run the benchmark for sizes 16, 80, 144, ... (step 64, below 3000),
# showing each result line and appending it to run.log.
c=16
while [ "$c" -lt 3000 ]
do
    ./transform "$c" | tee -a run.log
    c=$((c + 64))
done

以下は、このスクリプトの実行結果の一部です (列は左から size、gen_tr、genv_tr、ssev_tr)。時間はすべて、配列要素 1 つあたりのタイムスタンプカウンタ (rdtsc) の値で示しています。

16      7.710743        3.168577        3.253829
272     7.166493        1.983918        2.618569
528     7.121866        1.920195        2.567109
784     7.115007        1.899451        2.549645
1040    8.104026        2.481062        2.944317
1296    8.137537        5.105032        5.104614
1552    8.118534        5.068812        5.064211
1808    8.138309        5.077831        5.085015
2064    8.149699        5.107503        5.069958
2320    8.164556        5.080981        5.099313
2576    8.151524        5.086056        5.089294
2832    8.212946        5.061927        5.072261

ベクトル化されたバージョンの関数では、サイズが 1000 前後を境に速度が大きく変化するのはなぜですか? キャッシュミスのせいでしょうか? すべてのサイズ範囲で同じ速度を維持することは可能ですか?

score 1 · Accepted Answer

このテストでは 8 つの float 配列を使用します。サイズが 1000 の場合、テストは約 32kB (8 配列 × 1000 要素 × 4 バイト) のデータを扱うことになります。L1 キャッシュがそれより少し大きい (64kB) 場合でも、キャッシュの連想度の制約により、32kB のデータすべてを同時に保持できない可能性があります。

テストは繰り返され、同じデータが何度も処理されます。次の 2 つのケースを考えてみましょう。

サイズ = 528 : 8 つの配列が L1 キャッシュにうまく収まります。各テスト反復 (最初の反復を除く) では、データに高速にアクセスできます。
サイズ = 1268 : 8 つの配列が同時に L1 キャッシュに収まりません。テストの反復のたびに L1 からデータが追い出され続けるため、実質的にすべての読み取りと書き込みが L2 へのアクセスになります。

したがって、入力サイズ 1000 付近での速度の急変は、部分的にはテスト方法に起因するアーティファクトですが、それだけではありません。現実の世界でも、必要なデータがすべて既に L1 キャッシュにあれば、genv_tr は非常に高速に動作します。ただし、サイズが 1000 を超える入力では、すべての入力が L1 キャッシュに収まらないため、一部のアクセスは必ず L2 に行くことになります。

caching - 自動ベクトル化と sse を使用したデータ サイズ依存の高速化

変換.c:

1 に答える 1

Related

Reference

caching - 自動ベクトル化と sse を使用したデータサイズ依存の高速化