cuda - cudaEvent を使用して、コードの実行時間を測定しました。しかし、私は何かパズルを見つけました

Question

次のコードを使用して、コードの実行時間を測定しました。

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/pair.h>
#include <thrust/remove.h>

#include <math.h>
#include <fstream>
#include <string>
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
using namespace std;

const int MINCOUNTS = 20;
const int h = 10;
const int dim = 2;
//const int h2 = pow(double(h),double(dim));



struct DataType
{
    float d[dim];
};





void loadData(thrust::host_vector<DataType>& D_,string dir_, DataType& gt)
{



    fstream in(dir_.c_str(),ios::in);
    string tline;
    string::size_type position;
    getline(in,tline);

    int flag = atoi(tline.c_str());
    if(flag != 1)
    {
        cout<<"there is problem in file : "<<dir_<<endl;
        exit(-1);
    }

    getline(in,tline);
    int tot = atoi(tline.c_str());

    getline(in,tline);




    for(int i = 0; i < dim - 1; i++)
    {
        position = tline.find(" ");
        gt.d[i] = atof(tline.substr(0,position).c_str());
        tline = tline.substr(position+1, tline.size() - position);
    }
    gt.d[dim-1] = atof(tline.c_str());

    DataType dt;
    for(int i = 0; i < tot-1; i++)
    {
        getline(in,tline);
        for(int i = 0; i < dim - 1; i++)
        {
            position = tline.find(" ");
            dt.d[i] = atof(tline.substr(0,position).c_str());
            tline = tline.substr(position+1, tline.size() - position);
        }
        dt.d[dim-1] = atof(tline.c_str());
        D_.push_back(dt);
    }
}



__global__ void initialSM(int *gpu_Mchanged1, int *gpu_Schanged1,int N)
{
    int index = blockIdx.x;
    if(index < N)
    {
        gpu_Mchanged1[index] = index;
        gpu_Schanged1[index] = index;
    }

}


//parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N, h);
__global__ void parallelCal(int* gpu_Schanged1, DataType *input, DataType *msPoint, int tot) // h is the band-width of the kernel function;
{

    int index = blockIdx.x;
    int dis = 0;

    int ii = 0;
    int i0 = 0;

    int inlierNum = 0;
    //  double h2 = 10000;

    if(index < tot)
    {
        dis = 0;
        i0 = gpu_Schanged1[index];

        for(unsigned int i = 0; i < dim; i++)
            msPoint[index].d[i] = 0;

        for(int i = 0 ;i < tot ;i++)
        {
            ii = gpu_Schanged1[i];


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
            {
                dis += (input[i0].d[j] - input[ii].d[j])*(input[i0].d[j] - input[ii].d[j]);
                if(dis > pow(double(h),2.0))
                    break;
            }

            if (dis < pow(double(h),2.0))
            {
                inlierNum++;
                for(unsigned int j = 0; j < dim; j++)
                    msPoint[index].d[j] += (input[ii].d[j] - input[i0].d[j]);
            }
        }


        //      msPoint[index].d[0] = inlierNum;
        for(unsigned int j = 0; j < dim; j++)
        {
            msPoint[index].d[j] /= inlierNum;
            msPoint[index].d[j] += input[i0].d[j];
        }

    }
}


//nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
__global__ void nearestSearch(int *gpu_Schanged1,int *gpu_Mchanged1, DataType *msPoint, DataType *input, int tot, int *Sunchanged, int *Munchanged)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = 1000000;
    int flag = -1;
    int i1;
    if(index < tot)
    {

        for(int i = 0; i < tot; i++)
        {
            i1 = gpu_Schanged1[i];

            dis = 0;
            for(int j = 0; j < dim; j++)
                dis += (msPoint[index].d[j] - input[i1].d[j])*(msPoint[index].d[j] - input[i1].d[j]);

            if(dis <= disMin)
            {
                disMin = dis;
                flag = i1;
            }
        }
        Sunchanged[gpu_Schanged1[index]] = index;
        Munchanged[gpu_Schanged1[index]] = flag;
        gpu_Mchanged1[index] = flag;
    }
}
////routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
__global__ void routineTransmission(bool loop1st, int *gpu_Schanged1,int *gpu_Mchanged1, int *gpu_Sunchanged,int *gpu_Munchanged, const int tot)
{
    int index = blockIdx.x;
    bool find2 = false;

    if(index < tot)
    {
        int lastOne = -1;
        int thisOne = -1;
        int indexIter = index;
        while(1)
        {



            if(loop1st)
            {
                lastOne = gpu_Mchanged1[indexIter];
                thisOne = gpu_Mchanged1[lastOne];

                if(lastOne == thisOne)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = thisOne;
                    gpu_Mchanged1[index] = thisOne;
                    break;
                }
                indexIter = thisOne;
            }

            else
            {
                //              gpu_Mchanged1[index] = gpu_Schanged1[index];

                while(1)
                {
                    lastOne = gpu_Mchanged1[indexIter];
                    for(int i = 0; i < tot; i++)
                    {
                        if(i == indexIter)
                            continue;

                        if(lastOne == gpu_Schanged1[i])
                        {
                            thisOne = i;
                            find2 = true;
                            break;
                        }
                    }
                    if(find2 == false)
                        break;
                    indexIter = thisOne;
                    find2 = false;

                }
                if(thisOne != index && thisOne != -1)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = gpu_Schanged1[thisOne];
                    gpu_Mchanged1[index] = gpu_Schanged1[thisOne];
                }
                break;
            }
        }
    }

}
//

__global__ void deleteCircle(int *gpu_Mchanged1, int *gpu_Munchanged, const int N, bool loop1st)
{
    int index = blockIdx.x;
    int router0, router1;
    if(index < N)
    {
        if(loop1st)
        {
            router0 = gpu_Mchanged1[index];
            router1 = gpu_Mchanged1[router0];
            while(1)
            {

                if(index == router0 || index == router1)
                {
                    gpu_Munchanged[index] = index;
                    break;
                }
                if(router0 == router1)
                    break;
                router0 = gpu_Mchanged1[router1];
                router1 = gpu_Mchanged1[router0];
            }
        }

    }


}
__global__ void checkTheClusterSize(int *gpu_Mchanged1, int *gpu_Schanged1, int *gpu_Munchanged, int *gpu_clusterSize, int smallTot, int tot)
{
    int index = blockIdx.x;
    if(index < smallTot)
    {
        int count = 0;
        for(unsigned int i = 0; i < tot; i++)
        {
            if(gpu_Munchanged[i] == gpu_Mchanged1[index])
                count++;
        }
        gpu_clusterSize[index] = count;
        if(count <= MINCOUNTS)
            gpu_Schanged1[index] = -1;
    }

}
__global__ void checkTheCenterNum(int *gpu_Munchanged,int *gpu_Sunchanged, int *gpu_Kcounts ,int tot)
{
    int index = blockIdx.x;
    if(index < tot)
    {
        if (gpu_Kcounts[gpu_Munchanged[index]] < MINCOUNTS)
        {
            gpu_Sunchanged[index] = -1;
        }
    }


}

struct increaseOne: public thrust::unary_function<int, int>
{
    int operator()(int a_){return a_++;}

};
//
__global__ void mergeCentreSimple(int* gpu_Munchanged, int *gpu_clusterSize, DataType* gpu_input,int *gpu_Schanged1, int *gpu_Mchanged1, int tot)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = pow(double(h/2),2.0);
    int disMinIndex = -1;
    bool flag = false;
    if(index < tot)
    {
        for(unsigned int i = 0; i < tot; i++)
        {
            if(index == i)
                continue;


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
                dis += (gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j])*(gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j]);
            //          dis = (gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)*(gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)+(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2)*(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2);

            if(dis < disMin)
            {
                flag = true;
                disMin = dis;
                disMinIndex = i;
            }
        }

        if(flag)
            if(gpu_clusterSize[index] < gpu_clusterSize[disMinIndex])
            {
                gpu_Munchanged[gpu_Schanged1[index]] = gpu_Mchanged1[disMinIndex];
                gpu_Mchanged1[index] = gpu_Mchanged1[disMinIndex];

            }
    }
}



struct is_minus_one
{
    __host__ __device__
    bool operator()(const int x)
    {
        return(x == -1);
    }
};

typedef thrust::device_vector<int>::iterator dintiter;

int main(int argc, char** argv)
{
    //  int h = 100;
    using namespace std;
    thrust::host_vector<DataType> host_input;
    //  string dir = "/home/gaoy/cuda-workspace/DATA/input/dataMS/data_1.txt";
    string dir = "/home/gaoy/workspace/DATA/dataInput/gaussianDistribution_2500.txt";
    DataType gt;
    loadData(host_input,dir, gt);
    cudaEvent_t start,stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int loopTime = 100;
    float timeSum = 0;

    thrust::device_vector<DataType> device_input = host_input;  // Host端vector
    int N = device_input.size();
    int rN = N;
    int lastSize, thisSize;

    DataType *gpu_input;
    gpu_input = thrust::raw_pointer_cast(&device_input[0]);

    thrust::device_vector<DataType> device_msPoint;
    device_msPoint.resize(N);
    DataType *gpu_msPoint;

    thrust::device_vector<int> device_Sunchanged;
    device_Sunchanged.resize(N);
    int *gpu_Sunchanged;
    gpu_Sunchanged = thrust::raw_pointer_cast(&device_Sunchanged[0]);

    thrust::device_vector<int> device_Munchanged;
    device_Munchanged.resize(N);
    int *gpu_Munchanged;
    gpu_Munchanged = thrust::raw_pointer_cast(&device_Munchanged[0]);

    thrust::device_vector<int> device_Schanged1;
    device_Schanged1.resize(N);
    int *gpu_Schanged1;
    gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);

    thrust::device_vector<int> device_Mchanged1;
    device_Mchanged1.resize(N);
    int *gpu_Mchanged1;
    gpu_Mchanged1 = thrust::raw_pointer_cast(&device_Mchanged1[0]);

    thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;

    thrust::device_vector<int> device_clusterSize;

    initialSM<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,N);

    bool loop1st = true;
    dintiter Mend, Send, Cend;
    int *gpu_clusterSize;
    gpu_msPoint = thrust::raw_pointer_cast(&device_msPoint[0]);




    for(int i = 0; i < loopTime; i++)
    {

        cudaFree(0);
        cudaEventRecord(start,0);


        while(1)
        {
            lastSize = device_Schanged1.size();
            N = lastSize;
            device_msPoint.resize(N);

            parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N); //the size of the gpu_msPoint is as the same as the gpu_Mchanged1; but the gpu_input is the original data size
            device_Mchanged1.resize(N);
            nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);

            routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);


            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            //
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N = new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);

            device_clusterSize.clear();
            device_clusterSize.resize(N);

            gpu_clusterSize = thrust::raw_pointer_cast(&device_clusterSize[0]);
            checkTheClusterSize<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,gpu_Munchanged, gpu_clusterSize,N,rN);

            Mend = thrust::remove_if(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin(),is_minus_one());
            Cend = thrust::remove_if(device_clusterSize.begin(), device_clusterSize.end(), device_Schanged1.begin(), is_minus_one());
            Send = thrust::remove(device_Schanged1.begin(), device_Schanged1.end(), -1);

            N =  Send - device_Schanged1.begin();
            device_Schanged1.resize(N);
            device_Mchanged1.resize(N);
            device_clusterSize.resize(N);
            mergeCentreSimple<<<N,1>>>(gpu_Munchanged,gpu_clusterSize, gpu_input, gpu_Schanged1, gpu_Mchanged1, N);
            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N =  new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);


            thisSize = N;
            if(lastSize == thisSize)
                break;
            loop1st = false;

            thrust::copy(device_Mchanged1.begin(),device_Mchanged1.end(),device_Schanged1.begin());
            device_Mchanged1.clear();
            gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
        }
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        cudaEventElapsedTime(&time, start, stop);
        //      for(unsigned int ii = 0; ii < device_Mchanged1.size(); ii++)
        //          cout<<ii<<" "<<host_input[device_Schanged1[ii]].d[0]<<" "<<host_input[device_Schanged1[ii]].d[1]<<endl;

        timeSum += time;
        cout<<i<<" "<<time<<endl;
    }
    cout<<"elapsed: "<<timeSum/loopTime<<" ms"<<endl;









    return 0;


}

すべてのループの変数時間の出力は同じではなく、これは私が得た結果です:

しかし、私がテストするコードはいつも同じことをします。どの結果を信じるべきですか? 私はこれについて本当に困惑しています。ありがとう。

score 1 · Accepted Answer

実際のコードを投稿したので、タイミング方法論に明らかな問題は見られません。cudaFree(0);コードは最初のタイミングシーケンスのかなり前に既に cuda コンテキストを作成しているため、起動時間に関する記述は無関係です。

私が持っていないデータファイルに依存しているため、コードを実行することはできません。ただし、タイミングの変動の最も可能性の高い説明は、タイミングまたは実行中の作業に実際の変動があるということです。同じコードを実行しているように見えても、実行ごとに異なる時間がかかる場合があります。

これがどのようになるかのいくつかの例（これがあなたのコードに当てはまると言っているわけではありません。わかりません）：

Thrust::sort_by_key は、シーケンスが既にソートされている場合、ソートされていないシーケンスの場合とは異なる時間がかかります。インプレースソートを行っているため、既にソートされているデータをソートしているかどうかという疑問が生じます。タイミングループの最初のパスではデータが並べ替えられる場合がありますが、後続のパスでは既に並べ替えられたデータが並べ替えられるため、時間がかかりません。
別の例は、すべての.resize(N)操作です。ループを初めて通過すると、これらは実際のサイズ変更を行っている可能性がありますが、その後のパスでNは変更されない場合、実際のサイズ変更は行われないため、操作にかかる時間が短くなります。

繰り返しますが、これらの仮説のいずれかがあなたのコードに当てはまるかどうかはわかりません。場合によっては、同じコードシーケンスを繰り返し実行し、タイミングの変動を観察することが可能である可能性があることを指摘しているだけです。

コードは明らかに同一であるため、問題はデータを分析して、実行ごとに同じかどうかを確認することになります。興味深いテストは、命令が発生するwhile(1)前にループを通過した回数を追跡することです。break;これは、何が起こっているのかについても有益かもしれません。

cuda - cudaEvent を使用して、コードの実行時間を測定しました。しかし、私は何かパズルを見つけました

1 に答える 1

Related

Reference