python - 再帰層による奇妙な勾配の結果

Question

私は非常に基本的な再帰型ネットワークで実験を行っており、非常に奇妙な動作を見てきました。どこがうまくいかないかを絞り込むためにかなりの時間を費やしましたが、再帰層を使用すると、theano と有限微分によって計算される勾配が根本的に異なることがわかりました。ここで何が起こっているのですか？

これが私が抱えている問題の種類です：

n_class クラス間のラベルとともに、次元 n_feat の n_steps 特徴ベクトルの n_seq シーケンスがあります。ラベルは、シーケンスごとではなく、時間ステップごとです (したがって、n_seq*n_steps ラベルがあります)。私の目標は、モデルをトレーニングして、特徴ベクトルを正しく分類することです。

これが私の最小限の例です：

(実際には、データにはシーケンシャルな情報が含まれているため、再帰型ネットワークの方がうまくいくはずですが、この最小限の例では純粋にランダムなデータを生成します。これはバグを明らかにするのに十分です。)

2 つの最小限のネットワークを作成します。

1) 通常のフィードフォワード (再帰的ではない)。入力層とソフトマックスを含む出力層 (隠れ層なし) のみ。n_seq*n_steps の「独立した」特徴ベクトルの「バッチ」を考慮して、順次情報を破棄します。

2) 同一のネットワークですが、出力層は再帰的です。バッチのサイズは n_seq になり、各入力は n_steps 特徴ベクトルの完全なシーケンスになります。最後に、出力をサイズ n_seq*n_steps の「バッチ」に戻します。

再帰重みが 0 に設定されている場合、2 つのネットワークは同等である必要があります。実際、この場合、フィードフォワード重みのランダムな初期化に関係なく、両方のネットワークの初期損失が同じであることがわかります。有限微分を実装すると、フィードフォワードの重みの (初期の) 勾配が (本来あるべきように) 同じであることもわかります。ただし、theano から得られる勾配は根本的に異なります (ただし、リカレントネットワークの場合のみ)。

サンプル結果を含む私のコードは次のとおりです。

注：初めて実行すると、この警告が表示されます。何が原因かわかりませんが、私の問題に関連していると思います。警告: 厳密モードでは、必要なすべての共有変数を non_sequences の一部として渡す必要があります 'must be pass as a part of non_sequences', 警告)

どんな洞察も大歓迎です！

コード：

import numpy as np
import theano
import theano.tensor as T
import lasagne


# GENERATE RANDOM DATA
n_steps = 10**4
n_seq = 10
n_feat = 2
n_class = 2
data_X = lasagne.utils.floatX(np.random.randn(n_seq, n_steps, n_feat))
data_y = np.random.randint(n_class, size=(n_seq, n_steps))

# INITIALIZE WEIGHTS
# feed-forward weights (random)
W = theano.shared(lasagne.utils.floatX(np.random.randn(n_feat,n_class)), name="W")
# recurrent weights (set to 0)
W_rec = theano.shared(lasagne.utils.floatX(np.zeros((n_class,n_class))), name="Wrec")
# bias (set to 0)
b = theano.shared(lasagne.utils.floatX(np.zeros((n_class,))), name="b")



def create_functions(model, X, y, givens):
    """Helper for building a network."""
    loss = lasagne.objectives.categorical_crossentropy(lasagne.layers.get_output(model, X), y).mean()
    get_loss = theano.function(
        [], loss,
        givens=givens
    )
    all_params = lasagne.layers.get_all_params(model)
    get_theano_grad = [
        theano.function(
            [], g,
            givens=givens
        )
        for g in theano.grad(loss, all_params)
    ]
    return get_loss, get_theano_grad


def feedforward():
    """Creates a minimal feed-forward network."""
    l_in = lasagne.layers.InputLayer(
        shape=(n_seq*n_steps, n_feat),
    )
    l_out = lasagne.layers.DenseLayer(
        l_in,
        num_units=n_class,
        nonlinearity=lasagne.nonlinearities.softmax,
        W=W,
        b=b
    )
    model = l_out
    X = T.matrix('X')
    y = T.ivector('y')
    givens={
        X: theano.shared(data_X.reshape((n_seq*n_steps, n_feat))),
        y: T.cast(theano.shared(data_y.reshape((n_seq*n_steps,))), 'int32'),
    }
    return (model,) + create_functions(model, X, y, givens)


def recurrent():
    """Creates a minimal recurrent network."""
    l_in = lasagne.layers.InputLayer(
        shape=(n_seq, n_steps, n_feat),
    )
    l_out = lasagne.layers.RecurrentLayer(
        l_in,
        num_units=n_class,
        nonlinearity=lasagne.nonlinearities.softmax,
        gradient_steps=1,
        W_in_to_hid=W,
        W_hid_to_hid=W_rec,
        b=b,
    )
    l_reshape = lasagne.layers.ReshapeLayer(l_out, (n_seq*n_steps, n_class))
    model = l_reshape
    X = T.tensor3('X')
    y = T.ivector('y')
    givens={
        X: theano.shared(data_X),
        y: T.cast(theano.shared(data_y.reshape((n_seq*n_steps,))), 'int32'),
    }
    return (model,) + create_functions(model, X, y, givens)


def finite_diff(param, loss_func, epsilon):
    """Computes a finitie differentation gradient of loss_func wrt param.""" 
    loss = loss_func()
    P = param.get_value()
    grad = np.zeros_like(P)
    it = np.nditer(P , flags=['multi_index'])
    while not it.finished:
        ind = it.multi_index
        dP = P.copy()
        dP[ind] += epsilon
        param.set_value(dP)
        grad[ind] = (loss_func()-loss)/epsilon
        it.iternext()
    param.set_value(P)
    return grad


def theano_diff(net, get_theano_grad):
    for p,g in zip(lasagne.layers.get_all_params(net), get_theano_grad):
        if p.name == "W":
            gW = np.array(g())
        if p.name == "b":
            gb = np.array(g())
    return gW, gb


def compare_ff_rec():
    eps = 1e-3 # for finite differentiation
    ff, get_loss_ff, get_theano_grad_ff = feedforward()
    rec, get_loss_rec, get_theano_grad_rec = recurrent()
    gW_ff_finite = finite_diff(W, get_loss_ff, eps)
    gb_ff_finite = finite_diff(b, get_loss_ff, eps)
    gW_rec_finite = finite_diff(W, get_loss_rec, eps)
    gb_rec_finite = finite_diff(b, get_loss_rec, eps)
    gW_ff_theano, gb_ff_theano = theano_diff(ff, get_theano_grad_ff)
    gW_rec_theano, gb_rec_theano = theano_diff(rec, get_theano_grad_rec)
    print "\nloss:"
    print "FF:\t", get_loss_ff()
    print "REC:\t", get_loss_rec()
    print "\ngradients:"
    print "W"
    print "FF finite:\n", gW_ff_finite.ravel()
    print "FF theano:\n", gW_ff_theano.ravel()
    print "REC finite:\n", gW_rec_finite.ravel()
    print "REC theano:\n", gW_rec_theano.ravel()
    print "b"
    print "FF finite:\n", gb_ff_finite.ravel()
    print "FF theano:\n", gb_ff_theano.ravel()
    print "REC finite:\n", gb_rec_finite.ravel()
    print "REC theano:\n", gb_rec_theano.ravel()


compare_ff_rec()

結果：

loss:
FF:     0.968060314655
REC:    0.968060314655

gradients:
W
FF finite:
[ 0.23925304 -0.23907423  0.14013052 -0.14001131]
FF theano:
[ 0.23917811 -0.23917811  0.14011626 -0.14011627]
REC finite:
[ 0.23931265 -0.23907423  0.14024973 -0.14001131]
REC theano:
[  1.77408110e-05  -1.77408110e-05   1.21677476e-05  -1.21677458e-05]
b
FF finite:
[ 0.00065565 -0.00047684]
FF theano:
[ 0.00058145 -0.00058144]
REC finite:
[ 0.00071526 -0.00047684]
REC theano:
[  7.53380482e-06  -7.53380482e-06]

python - 再帰層による奇妙な勾配の結果

1 に答える 1

Related

Reference