1

以下は、Ubuntu 12.04 (Precise) を搭載したラップトップで bumblebee と optirun を使用して実行した Nvidia CUDA の rootbeer サンプル コードです。ラップトップには Nvidia Optimus が搭載されているため、optirun が搭載されています。GPU はたまたま Nvidia GeForce GT 540M で、Nvidia の Web サイトによると 96 コアです。スループットはほとんど向上しません。何が問題ですか?

package com.random.test;

import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;

import edu.syr.pcpratts.rootbeer.runtime.Kernel;
import edu.syr.pcpratts.rootbeer.runtime.Rootbeer;

public class ArraySumApp {
    final static int numberOfJobs = 1024; // 1024 in the original example
    final static int sizeOfArray = 512; // 512 in the original example
    final static int theAnswer = 130816;

    public int[] sumArrays(List<int[]> arrays) {

        List<Kernel> jobs = new ArrayList<Kernel>();
        int[] ret = new int[arrays.size()];
        for (int i = 0; i < arrays.size(); ++i) {
            jobs.add(new ArraySum(arrays.get(i), ret, i));
        }

        Rootbeer rootbeer = new Rootbeer();
        rootbeer.runAll(jobs);
        return ret;
    }

    private static long measureOneJob() {

        int[] source = new int[ArraySumApp.sizeOfArray];
        int[] destination = new int[1];
        for (int i = 0; i < ArraySumApp.sizeOfArray; i++)
            source[i] = i;
        Kernel job = new ArraySum(source, destination, 0);

        ElapsedTimer et = new ElapsedTimer();
        job.gpuMethod();
        long timeInMs = et.stopInMilliseconds();
        System.out.println("measureOneJob " + et.stringInMilliseconds());

        assert destination[0] == ArraySumApp.theAnswer : "cosmic rays";
        return timeInMs;
    }

    public static void main(String[] args) {

        Helper.assertAssertionEnabled();

        // measure the time to do one job
        ArraySumApp.measureOneJob();
        long oneJob = ArraySumApp.measureOneJob();

        ArraySumApp app = new ArraySumApp();
        List<int[]> arrays = new ArrayList<int[]>();

        // you want 1000s of threads to run on the GPU all at once for speedups
        for (int i = 0; i < ArraySumApp.numberOfJobs; ++i) {
            int[] array = new int[ArraySumApp.sizeOfArray];
            for (int j = 0; j < array.length; ++j) {
                array[j] = j;
            }
            arrays.add(array);
        }

        ElapsedTimer et = new ElapsedTimer();
        int[] sums = app.sumArrays(arrays);
        long allJobs = et.stopInMilliseconds();
        System.out.println("measureAllJobs " + et.stringInMilliseconds());

        double gainFactor = ((double) ArraySumApp.numberOfJobs) * oneJob
                / allJobs;
        System.out.println(String.format(
                "throughput gain factor %.1f\nthroughput gain %.1f\n",
                gainFactor, gainFactor - 1.0d));

        // check the number of answers is correct
        assert sums.length == ArraySumApp.numberOfJobs : "cosmic rays";

        // check they all have the answer
        for (int i = 0; i < ArraySumApp.numberOfJobs; i++)
            assert sums[i] == ArraySumApp.theAnswer : "cosmic rays";
    }
}

class ArraySum implements Kernel {

    final static int repetitionFactor = 100000;

    private int[] source;
    private int[] ret;
    private int index;

    public ArraySum(int[] src, int[] dst, int i) {
        source = src;
        ret = dst;
        index = i;
    }

    public void gpuMethod() {
        for (int repetition = 0; repetition < ArraySum.repetitionFactor; repetition++) {
            int sum = 0;
            for (int i = 0; i < source.length; ++i) {
                sum += source[i];
            }
            ret[index] = sum;
        }
    }
}

class Helper {
    private Helper() {
    }

    static void assertAssertionEnabled() {
        try {
            assert false;
        } catch (AssertionError e) {
            return;
        }
        Helper.noteCosmicRays();
    }

    static void noteCosmicRays() // programmer design or logic error
    {
        throw new RuntimeException("cosmic rays");
    }
}

class ElapsedTimer {
    private org.joda.time.DateTime t0;
    private long savedStopInMilliseconds;

    public ElapsedTimer() {
        this.t0 = new org.joda.time.DateTime();
    }

    public long stopInMilliseconds() {
        return stop();
    }

    public String stringInMilliseconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        f.format("%d ms", this.savedStopInMilliseconds);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInMilliseconds() {
        stop();
        return stringInMilliseconds();
    }

    public String stringInSecondsAndMilliseconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        f.format("%5.3f s", this.savedStopInMilliseconds / 1000.0d);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInSecondsAndMilliseconds() {
        stop();
        return stringInSecondsAndMilliseconds();
    }

    public long stopInSeconds() {
        return (stop() + 500L) / 1000L; // rounding
    }

    public String stringInSeconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        long elapsed = (this.savedStopInMilliseconds + 500L) / 1000L; // rounding
        f.format("%d s", elapsed);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInSeconds() {
        stop();
        return stringInSeconds();
    }

    /**
     * This is private. Use the stopInMilliseconds method if this is what you
     * need.
     */
    private long stop() {
        org.joda.time.DateTime t1 = new org.joda.time.DateTime();
        savedStopInMilliseconds = t1.getMillis() - this.t0.getMillis();
        return savedStopInMilliseconds;
    }
}

これは出力です:

measureOneJob 110 ms
measureOneJob 26 ms
CudaRuntime2 ctor: elapsedTimeMillis: 609
measureAllJobs 24341 ms
throughput gain factor 1.1
throughput gain 0.1
4

2 に答える 2

1

ルートビアの開発者は、配列要素の合計をとるサンプルコードは最良の例ではなく、別の例でスループットの向上が示されると述べました。

于 2012-12-11T08:58:20.467 に答える