c++ - カスタム op の CPU 実装が選択されているのはなぜですか?

Question

カスタム TensorFlow op の書き方を学ぶために、「新しい Op の追加」チュートリアルに従い、すべての入力値にスカラー b を加算する「add_b」op を作成しました。

add_b_op.cc:

#define EIGEN_USE_THREADS

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"

using namespace tensorflow;

// Op definition for "AddB": two inputs (`input`, `b`) and one output, all of
// the same element type T, where T is restricted to float or double.
REGISTER_OP("AddB")
    .Attr("T: {float, double}")
    .Input("input: T")
    .Input("b: T")
    .Output("output: T")
    .SetShapeFn([](shape_inference::InferenceContext* ctx) -> Status {
      // Shape inference fails unless `b` (input 1) has rank 0.
      shape_inference::ShapeHandle b_shape;
      TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(1), 0, &b_shape));
      // The output shape itself is decided by the stock UnchangedShape helper.
      return shape_inference::UnchangedShape(ctx);
    })
    .Doc(R"doc(
Adds `b` to each input.

input: The input values.
b: A number to add to each input value.
)doc");


template <typename T>
class AddBCpuOp : public OpKernel {
 public:
  explicit AddBCpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    const Eigen::ThreadPoolDevice& d = context->eigen_device<Eigen::ThreadPoolDevice>();

    // Note: The mistake of adding 1 instead of `b` is intentional to be able to distinguish
    // the CPU and GPU implementations.
    output.device(d) = input + static_cast<T>(1);
  }
};

// Register AddBCpuOp as the DEVICE_CPU kernel of "AddB", once per element type
// allowed by the op's `T` attribute.
REGISTER_KERNEL_BUILDER(
    Name("AddB").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    AddBCpuOp<float>);
REGISTER_KERNEL_BUILDER(
    Name("AddB").Device(DEVICE_CPU).TypeConstraint<double>("T"),
    AddBCpuOp<double>);


#if GOOGLE_CUDA

// Declaration of the host-side launcher for the AddB CUDA kernel. Only the
// declaration lives in this file; the definition and its explicit
// instantiations are in add_b_op.cu.cc.
// The bool result is treated by the caller as "launch succeeded".
template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input,
                      int n,
                      const T *__restrict__ d_b,
                      T *__restrict__ d_output);

// GPU kernel for the "AddB" op, templated on the element type T.
template <typename T>
class AddBGpuOp : public OpKernel {
 public:
  explicit AddBGpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  // Adds the scalar `b` (input 1) to every element of input 0 by delegating to
  // LaunchAddBKernel(), writing the result into output 0, which is allocated
  // with the same shape as input 0.
  //
  // Sets an error status on `context` and returns early when:
  //   - `b` is not a scalar (InvalidArgument),
  //   - the element count of the input does not fit into `int`
  //     (InvalidArgument),
  //   - output allocation fails (status from allocate_output),
  //   - LaunchAddBKernel() returns false (Internal).
  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    const Tensor& b_tensor = context->input(1);
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(b_tensor.shape()),
                errors::InvalidArgument("add_b expects a scalar for `b`."));
    const auto b = b_tensor.scalar<T>();

    // LaunchAddBKernel() takes the element count as `int`, while the flattened
    // tensor reports it with a wider index type. Reject sizes that do not
    // survive the conversion instead of silently truncating them, which would
    // leave part (or all) of the output unwritten.
    const auto num_elements = input.dimension(0);
    const int n = static_cast<int>(num_elements);
    OP_REQUIRES(context, n >= 0 && n == num_elements,
                errors::InvalidArgument(
                    "add_b: input has too many elements for the GPU kernel."));

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    OP_REQUIRES(context, LaunchAddBKernel(input.data(), n, b.data(), output.data()),
                errors::Internal("add_b: LaunchAddBKernel() failed."));
  }
};

// Register AddBGpuOp as the DEVICE_GPU kernel of "AddB", once per element type
// allowed by the op's `T` attribute.
REGISTER_KERNEL_BUILDER(
    Name("AddB").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    AddBGpuOp<float>);
REGISTER_KERNEL_BUILDER(
    Name("AddB").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    AddBGpuOp<double>);

#endif // if GOOGLE_CUDA

add_b_op.cu.cc

// Element-wise CUDA kernel: the thread with global index i (if i < n) writes
//   d_output[i] = d_input[i] + *d_b
//
// BLOCK_DIM_X is used as the per-block stride when computing the global index,
// so it has to match the x-dimension of the block the kernel is launched with.
template <typename T, int BLOCK_DIM_X>
__global__ void AddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  const int idx = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
  // Threads that fall past the end of the array (last block) do nothing.
  if (idx >= n) {
    return;
  }
  d_output[idx] = d_input[idx] + *d_b;
}

template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  if (n <= 0) return true;

  constexpr int BLOCK_DIM_X = 256;
  AddBKernel<T, BLOCK_DIM_X><<<n / BLOCK_DIM_X + (n % BLOCK_DIM_X != 0), BLOCK_DIM_X>>>(d_input, n, d_b, d_output);
  return true;
}

// Explicitly instantiate the launcher for float and double (the element types
// the op is registered for), so that code which only sees the declaration can
// link against these definitions.
template bool LaunchAddBKernel<float>(
    const float *__restrict__, int, const float *__restrict__, float *__restrict__);
template bool LaunchAddBKernel<double>(
    const double *__restrict__, int, const double *__restrict__, double *__restrict__);

CPU または GPU 実装が使用されているかどうかを区別できるように、意図的に CPU 実装にエラーを導入しました。

カスタム op を次のようにテストすると、次のようになります。

from __future__ import print_function
import tensorflow as tf

# Load the shared library that contains the custom op.
custom_ops = tf.load_op_library('custom_ops.so')

# Ask the session to log the device each node is placed on.
session_config = tf.ConfigProto(log_device_placement=True)
with tf.Session(config=session_config):
  result = custom_ops.add_b([5., 4., 3., 2., 1.], 8.)
  print(result.eval())

次の出力が得られます。

I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] OS X は NUMA をサポートしていません - NUMA ノード 0 を返します
I tensorflow/core/common_runtime/gpu/gpu_device.cc:951] プロパティを持つデバイス 0 が見つかりました:
名前: GeForce GT 750M
メジャー: 3 マイナー: 0 memoryClockRate (GHz) 0.9255
pciBusID 0000:01:00.0
合計メモリ: 2.00GiB
空きメモリ: 1.80GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:972] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:1041] TensorFlow デバイスの作成 (/gpu:0) -> (デバイス: 0、名前: GeForce GT 750M、pci バス ID: 0000:01:00.0)
デバイス マッピング:
/job:localhost/replica:0/task:0/gpu:0 -> デバイス: 0、名前: GeForce GT 750M、pci バス ID: 0000:01:00.0
I tensorflow/core/common_runtime/direct_session.cc:252] デバイス マッピング:
/job:localhost/replica:0/task:0/gpu:0 -> デバイス: 0、名前: GeForce GT 750M、pci バス ID: 0000:01:00.0

AddB: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB: /job:localhost/replica:0/task:0/gpu:0
AddB/b: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/b: /job:localhost/replica:0/task:0/gpu:0
AddB/input: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/input: /job:localhost/replica:0/task:0/gpu:0
[ 6. 5. 4. 3. 2. ]

「デバイス配置ログ」は GPU で op が実行されていることを示しているように見えますが、出力は CPU 実装が使用されていることを示しています。

DEVICE_CPU 実装の 2 つの REGISTER_KERNEL_BUILDER() 登録をコメントアウトして再コンパイルし、再テストすると、期待どおりの出力 [ 13. 12. 11. 10. 9.] が得られますが、次のエラーも発生します。

E tensorflow/core/common_runtime/executor.cc:334] Executor はカーネルの作成に失敗しました。見つかりません: ノードと互換性のある CPU デバイスの 'AddB' OpKernel が登録されていません AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)
    . 登録済み: device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]

     [[ノード: AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)]]

このエラーメッセージはバグのように見えます。エラーには「エグゼキュータがカーネルを作成できませんでした」と表示されますが、GPU で op を実行するためにカーネルが作成されたようです。

GPU 実装ではなく CPU 実装が使用されているのはなぜですか?

これが重要な場合に備えて、私の開発セットアップの詳細を以下に示します。

NVIDIA GeForce GT 750M (CUDA Compute Capability 3.0) を内蔵した MacBook Pro を使用しています。
macOS シエラバージョン 10.12.1 (16B2555)
cuda_8.0.47_mac、cudnn-8.0-osx-x64-v5.1
TensorFlow 0.11.0rc2 のインストール:export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc2-py2-none-any.whl

更新: CPU と GPU のどちらの実装が選択されるかは、入力のサイズに依存することがわかりました。次のテストスクリプトで確認しました。

from __future__ import print_function
import numpy as np
import tensorflow as tf
from time import time

# Number of elements in the input array passed to the op.
NUM_VALUES = 1310720

values = np.arange(0, NUM_VALUES, dtype=float)

custom_ops = tf.load_op_library('custom_ops.so')
session_config = tf.ConfigProto(log_device_placement=True)
with tf.Session(config=session_config):
  # Time building and evaluating the op, then print the elapsed seconds.
  started_at = time()
  print(custom_ops.add_b(values, 8.).eval())
  finished_at = time()
  print(finished_at - started_at)

NUM_VALUES が 1310720 以下の場合は CPU 実装が使用され、1310721 以上の場合は GPU 実装が使用されます。

(1310720 * double あたり 8 バイト = ) 10 MiB カットオフはありますか? もしそうなら、どうすればこれをオーバーライドできますか? AddB() op は十分に単純ですが、より複雑なカスタム操作の場合、10 MiB は GPU 実装を選択するにはしきい値が大きすぎる可能性があります。

score 3 · Accepted Answer

TensorFlow issue #2054「CPU と GPU の両方の実装を持つカスタムオペレーターを GPU に手動配置しても、常に CPU バージョンが実行される」を読みました。CPU 実装が実行されるこの動作は、「定数畳み込み (constant folding)」と呼ばれる TensorFlow の機能によるもののようです。TensorFlow は最初の実行の前にグラフを最適化しますが、その際、CPU と GPU の実装は同じ結果を生成するはずだと見なされるため、定数を含む操作は一般に CPU 上で評価されます。理にかなっています。

この動作を無効にする 2 つの方法は次のとおりです。

グラフの最適化を無効にする:

from __future__ import print_function
import numpy as np
import tensorflow as tf
from time import time

# Number of elements in the input array passed to the op.
NUM_VALUES = 10

values = np.arange(0, NUM_VALUES, dtype=float)

ops_lib = tf.load_op_library('custom_ops.so')

# Log device placement and set the optimizer level to -1; this script is the
# "disable graph optimization" variant.
session_config = tf.ConfigProto(log_device_placement=True)
session_config.graph_options.optimizer_options.opt_level = -1

with tf.Session(config=session_config):
  # Time building and evaluating the op, then print the elapsed seconds.
  started_at = time()
  print(ops_lib.add_b(values, 8.).eval())
  finished_at = time()
  print(finished_at - started_at)

たとえば、値をプレースホルダーに入力して、定数を使用しない:

from __future__ import print_function
import numpy as np
import tensorflow as tf
from time import time

# Number of elements in the `input` placeholder.
NUM_VALUES = 10

ops_lib = tf.load_op_library('custom_ops.so')

# Build the graph with placeholders for both op inputs, so that neither input
# is a constant.
g = tf.Graph()
with g.as_default():
  input_ph = tf.placeholder(tf.float64, shape=(NUM_VALUES,))
  b_ph = tf.placeholder(tf.float64, shape=())
  total = ops_lib.add_b(input_ph, b_ph)

session_config = tf.ConfigProto(log_device_placement=True)
with tf.Session(graph=g, config=session_config) as sess:
  feeds = {}
  feeds[input_ph] = np.arange(0, NUM_VALUES, dtype=float)
  feeds[b_ph] = 8.
  # Time a single run of the op, then print the elapsed seconds.
  t0 = time()
  print(sess.run([total], feed_dict=feeds))
  t1 = time()
  print(t1 - t0)

score 2 · Accepted Answer

テンプレートのインスタンス化が間違っている可能性があると思います:

template <typename Device, typename T>
class AddBOp : public OpKernel {
...
}

REGISTER_KERNEL_BUILDER(
    Name("AddB")
    .Device(DEVICE_CPU)
    .TypeConstraint<float>("T"),
    AddBOp<CPUDevice, float>);

その後：

template <typename T>
class AddBOp<GPUDevice, T> : public OpKernel {
...
}

REGISTER_KERNEL_BUILDER(
    Name("AddB")
    .Device(DEVICE_GPU)
    .TypeConstraint<float>("T"),
    AddBOp<GPUDevice, float>);

GPU の AddB の登録は、2 番目の実装ではなく、最初の実装に一致するオブジェクトをインスタンス化すると思います (最初の実装には 2 つのテンプレート引数があり、2 番目の実装には 1 つのテンプレート引数があります)。

おそらく、2 番目の登録で AddBOp<float> を指定することでこれを修正できますが、混乱を避けるために、より適切な名前を付けることをお勧めします。

c++ - カスタム op の CPU 実装が選択されているのはなぜですか?

3 に答える 3

Related

Reference