I'm implementing an LSTM language model with Chainer 1.22.0. My code works on the CPU but not on the GPU... more precisely, the objective function converges quite quickly on the CPU but does not converge on the GPU. Any idea what might be going on?
Thanks in advance for your help!
Output:
$ python debug.py --gpu -1
cpu mode
objective in epoch 0 : 14.8049154282
objective in epoch 1 : 11.7126655579
objective in epoch 2 : 10.6166152954
objective in epoch 3 : 9.81489753723
objective in epoch 4 : 8.90626144409
objective in epoch 5 : 7.73007297516
objective in epoch 6 : 6.31889343262
objective in epoch 7 : 4.83179998398
objective in epoch 8 : 3.52315592766
objective in epoch 9 : 2.58598852158
$ python debug.py --gpu 0
gpu mode 0
objective in epoch 0 : 14.8049144745
objective in epoch 1 : 14.3081817627
objective in epoch 2 : 14.0404243469
objective in epoch 3 : 13.8618173599
objective in epoch 4 : 13.7236022949
objective in epoch 5 : 13.6082553864
objective in epoch 6 : 13.5111179352
objective in epoch 7 : 13.4323377609
objective in epoch 8 : 13.3735141754
objective in epoch 9 : 13.3361949921
Environment:
- Python 2.7.13 (Anaconda)
- Chainer 1.22.0
- CUDA 8.0
My model:
minimize  -sum_t log P(s_t | s_{<t})
where P(s_t | s_{<t}) = LSTM(s_{<t})
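(Just to make the notation concrete, here is a tiny NumPy-only sketch of what this objective computes for a single sequence; the function name and shapes are only for illustration and are not part of the code below.)

import numpy as np

def neg_log_likelihood(log_probs, seq):
    # log_probs[t] is the predicted log-distribution over the word that follows seq[t],
    # given the prefix seq[0..t]; every word except the first one is scored
    return -sum(log_probs[t][seq[t + 1]] for t in range(len(seq) - 1))

# toy check: voc_size = 5, T = 4, uniform predictions -> 3 * log(5) ~ 4.83
log_probs = np.log(np.full((3, 5), 0.2))
print neg_log_likelihood(log_probs, [1, 2, 3, 4])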
Full code:
'''
minimal example for this question
'''
import numpy as np
try:
    import cupy as xp
except ImportError:
    pass
import sys
import chainer as ch
import chainer.links as L
import chainer.functions as F
INT = "int32"
FLOAT="float32"
BOOLEAN='bool'
class LSTM(ch.Chain):
    def __init__(self, voc_size, in_size, out_size, batch_size):
        np.random.seed(0)
        w1 = np.random.normal(size=[voc_size, in_size])
        super(LSTM, self).__init__(
            emb=L.EmbedID(voc_size, in_size, initialW=w1),    # word embedding
            enc=L.LSTM(in_size=in_size, out_size=out_size),   # LSTM cell
            scores=L.Linear(out_size, voc_size)               # output transformation
        )
        self.batch_size = batch_size
        self.out_size = out_size
        self.gpu_idx = -1

    # put links on GPU
    def to_gpu(self, device_idx):
        self.gpu_idx = device_idx
        self.emb.to_gpu(device_idx)
        self.scores.to_gpu(device_idx)
        self.enc.to_gpu(device_idx)
    def obj(self, seq):
        # objective function: negative log-likelihood summed over every word in the batch
        return -F.sum(self.logL(seq))
    def logL(self, seq):
        '''
        seq    : batch of source sequences of length T : List<List<int>>
        RETURN : R^{batch_size x T} : computation-graph node
        '''
        T = xp if self.gpu_idx >= 0 else np
        padded = T.transpose(T.array(seq, dtype=INT))  # Z^{T x batch_size}
        # reset LSTM cell
        self.enc.reset_state()
        logL = []
        # logL for each time step except the first input
        for i in range(0, len(padded) - 1):
            # get LSTM output
            h = self.enc(self.emb(padded[i]))   # R^{batch_size x hidden_size}
            # probability distribution over the vocabulary
            s = self.scores(F.tanh(h))          # R^{batch_size x voc_size}
            s = F.transpose(F.log_softmax(s))   # R^{voc_size x batch_size}
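            # embed_id below looks up row padded[i+1][j] of the transposed log-probabilities
            # for every batch element j, so entry (j, j) holds log P(next word | history);
            # the identity mask and the sum over axis 0 extract that diagonal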
            # likelihood of the next word
            l = F.embed_id(padded[i + 1], s)    # R^{batch_size x batch_size}
            l = F.sum(l * T.identity(self.batch_size), axis=0)  # R^{batch_size}
            logL += [l]
        return F.transpose(F.stack(logL))
GPU_TAG = "--gpu"
if __name__ == "__main__":
    args = sys.argv
    gpu_idx = -1
    i = 0
    # arguments
    while i < len(args):
        if args[i] == GPU_TAG:
            i += 1
            gpu_idx = int(args[i])
        i += 1
    # hyperparameters
    voc_size = 5
    batch_size = 3
    in_size = 5
    out_size = 2
    # instantiate model
    model = LSTM(voc_size, in_size, out_size, batch_size)
    # GPU mode or CPU mode
    if gpu_idx >= 0:
        print "gpu mode ", gpu_idx
        ch.cuda.get_device(gpu_idx).use()
        model.to_gpu(gpu_idx)
    else:
        print "cpu mode"
    # prepare optimizer
    trainer = ch.optimizers.sgd.SGD(lr=0.3)
    trainer.setup(model)
    # seq to train
    x = [[1, 2, 3, 4]] * batch_size
    # main training loop
    for epoch in range(10):
        obj = model.obj(x)   # forward pass
        model.cleargrads()   # clear gradients before the backward pass
        obj.backward()       # backward pass
        print "objective in epoch ", epoch, ": ", obj.data
        trainer.update()     # update parameters