以下の解決策:
シナリオ:
ユーザー定義関数のヤコビアンをループで何度も計算しようとしています。TF 2 の GradientTape と、古いセッション ベースの tf.gradients() メソッドでこれを行うことができます。問題は、GradientTape が tf.gradients() よりも非常に遅い (100 倍遅い) ことです。使いたい機能 (bath_jacobian、ヘッセ行列のサポートなど) がありますが、100 倍遅い場合は使用できません。
質問:
私が単にGradientTapeを誤用しているのか、それとも呼び出されるたびに提供された関数を再微分する必要があるため常に遅くなるのかは明らかではありません(私の疑い)。GradientTape の使用を修正するためのヒント、または常に tf.gradients よりも桁違いに基本的に遅くなるという確認を求めています。
関連する質問:
- 複数のヤコビアン計算に対する GradientTape の繰り返し使用- 同じシナリオ、未回答
- `GradientTape` は導関数の各評価を再微分する必要がありますか? - 同じシナリオ、未回答
- グローバル コンテキストで 1 つの GradientTape を使用する- 緩やかに関連し、そのソリューションを私のシナリオに適用するのに問題がある
GradientTape と tf.gradients() を比較するための完全に含まれる最小限の例:
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
# from tensorflow.python.ops.parallel_for.gradients import jacobian, batch_jacobian
import timeit
class FunctionCaller(object):
def __init__(self, func, nX, dtype=tf.float64, useSessions=True):
if useSessions:
disable_eager_execution()
self.func = func
self.nX = nX
self.useSessions = useSessions
self.dtype = dtype
self.sess = tf.compat.v1.Session() if useSessions else None
if not useSessions:
return
#
# we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
#
xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
# add function to graph and guarantee its output shape
func_tensor = tf.reshape(func(xTensor), [-1, nX])
# take the gradient for each output, one at a time, and stack the results back together
each_output = tf.unstack(func_tensor, nX, axis=1)
jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
for output in each_output], axis=1)
# record these tensors so we can use them later with session.run()
self.xTensor = xTensor
self.func_tensor = func_tensor
self.jac_func_tensor = jac_x
def jac(self, x_i):
if self.useSessions:
return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
else:
return self._useGradientTape(x_i)
# THIS FUNCTION IS SUPER INEFFICIENT.
def _useGradientTape(self, x_i):
with tf.GradientTape(persistent=True) as g:
xTensor = tf.Variable(x_i, dtype=self.dtype) # is this my problem??? i recreate x every time?
y = tf.reshape(self.func(xTensor), [-1, self.nX])
jac_x_at_i = g.batch_jacobian(y, xTensor)
# del g
return jac_x_at_i.numpy()
def __del__(self):
if self.sess is not None:
self.sess.close()
def main():
@tf.function
def Xdot(x_i):
x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)
nT = 20
nX = 3
# create some trash data
x_i = np.arange(nT*nX).reshape([-1, nX])
nTrials = 100
# try the eager version first
caller_eager = FunctionCaller(Xdot, nX, useSessions=False)
start_time = timeit.default_timer()
for _ in range(nTrials):
jac_eager = caller_eager.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
# now try the sessions version
caller_sessions = FunctionCaller(Xdot, nX, useSessions=True)
start_time = timeit.default_timer()
caller_sessions.jac(x_i) # call it once to do its graph building stuff?
for _ in range(nTrials):
jac_session = caller_sessions.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
residual = np.max(np.abs(jac_eager - jac_session))
print('residual between eager and session trials is {}'.format(residual))
if __name__ == "__main__":
main()
編集 - 解決策:
xdurch0 は、@tf.function で _useGradientTape() をラップする必要があることを以下に指摘しました。これは、他の理由で以前に失敗したものです。これを行ったら、xTensor の定義を @tf.function ラッパーの外に移動する必要がありました。それには、メンバー変数を作成し、tf.assign() を使用します。
これがすべて完了すると、GradientTape (この単純な例の場合) が tf.gradints と同じ大きさになっていることがわかります。十分な試行 (~1E5) を実行すると、tf.gradients の 2 倍の速度になります。驚くばかり!
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
import timeit
class FunctionCaller(object):
def __init__(self, func, nT, nX, dtype=tf.float64, useSessions=True):
if useSessions:
disable_eager_execution()
self.func = func
self.nX = nX
self.useSessions = useSessions
self.dtype = dtype
self.sess = tf.compat.v1.Session() if useSessions else None
if not useSessions:
# you should be able to create without an initial value, but tf is demanding one
# despite what the docs say. bug?
# tf.Variable(initial_value=None, shape=[None, nX], validate_shape=False, dtype=self.dtype)
self.xTensor = tf.Variable([[0]*nX]*nT, dtype=self.dtype) # x needs to be properly sized once
return
#
# we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
#
xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
# add function to graph and guarantee its output shape
func_tensor = tf.reshape(func(xTensor), [-1, nX])
# take the gradient for each output, one at a time, and stack the results back together
each_output = tf.unstack(func_tensor, nX, axis=1)
jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
for output in each_output], axis=1)
# record these tensors so we can use them later with session.run()
self.xTensor = xTensor
self.func_tensor = func_tensor
self.jac_func_tensor = jac_x
def jac(self, x_i):
if self.useSessions:
return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
else:
return self._useGradientTape(x_i).numpy()
@tf.function # THIS IS CRUCIAL
def _useGradientTape(self, x_i):
with tf.GradientTape(persistent=True) as g:
self.xTensor.assign(x_i) # you need to create the variable once outside the graph
y = tf.reshape(self.func(self.xTensor), [-1, self.nX])
jac_x_at_i = g.batch_jacobian(y, self.xTensor)
# del g
return jac_x_at_i
def __del__(self):
if self.sess is not None:
self.sess.close()
def main():
@tf.function
def Xdot(x_i):
x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)
nT = 20
nX = 3
# create some trash data
x_i = np.random.random([nT, nX])
nTrials = 1000 # i find that nTrials<=1E3, eager is slower, it's faster for >=1E4, it's TWICE as fast for >=1E5
# try the eager version first
caller_eager = FunctionCaller(Xdot, nT, nX, useSessions=False)
start_time = timeit.default_timer()
for _ in range(nTrials):
jac_eager = caller_eager.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
# now try the sessions version
caller_sessions = FunctionCaller(Xdot, nT, nX, useSessions=True)
start_time = timeit.default_timer()
for _ in range(nTrials):
jac_session = caller_sessions.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
residual = np.max(np.abs(jac_eager - jac_session))
print('residual between eager and session trials is {}'.format(residual))
if __name__ == "__main__":
main()