0

以前にスタックでこの問題を見たことがありますが、解決策はうまくいきませんでした。Save and Load testing classify Naive Bayes Classifier in NLTK in another method を参照してください。同じプログラムでトレーニングと分類を行うのとは対照的に、ピクルス化された分類子をロードすると、なぜ精度が大きく異なるのか、私は困惑しています。最初のコード ブロックは pickled 分類器を呼び出しており、2 番目のコード ブロックはすべてのトレーニングと分類を一緒に行っています。2 番目の方法では 99% の精度が得られ、最初の方法では 81% の精度が得られます...

Academic_classifier= None
Academic_classifier=pickle.load(open('Academic_classifier.pickle','rb'))
tweets=[]

readdata=csv.reader(open('C:\Users\Troy\Documents\Data\Gold_test.csv','r'))
for row in readdata:
    tweets.append(row)
Header = tweets[0]
tweets.pop(0)
Academic_test_tweets=tweets[:]
Tweets=[]
for (words, sentiment) in tweets:
    bigram=[]
    bigram_list=[]
    words_filtered = [e.lower() for e in WordPunctTokenizer().tokenize(words) if len(e) >= 3]
    words_filtered=[re.sub(r'(.)\1+', r'\1\1', e) for e in words_filtered if len(e)>=3]
    bigram_words=bigrams(words_filtered)
    for x in bigram_words:
        bigram.append(x)
    for (bi) in bigram:
        bigram_word=bi[0]+bi[1]
        bigram_list.append(bigram_word)
    list_to_append=words_filtered+bigram_list
    Tweets.append((list_to_append, sentiment))
Academic_test_tweets_words=Tweets[:]


word_features = get_word_features(get_words_in_tweets(Academic_test_tweets_words))
Academic_test_set = nltk.classify.apply_features(extract_features,Academic_test_tweets_words)

print(nltk.classify.accuracy(Academic_classifier, Academic_test_set), 'tweet corpus used in academic paper Sentiment Analysis on the Social Networks Using Stream Algorithms Authors: Nathan Aston, Timothy Munson, Jacob Liddle, Garrett Hartshaw, Dane Livingston, Wei Hu   *compare to their accuracy of 87.5%')

このコードとは対照的に、精度をトレーニングしてテストします。私はすべてに同じ定義を使用しているので、問題が定義にないことはわかっています。唯一の違いは、漬物の分類器です...何が起こっているのでしょうか?

tweets=[]

readdata=csv.reader(open('C:\Users\Troy\Documents\Data\Gold_test.csv','r'))
for row in readdata:
    tweets.append(row)
Header = tweets[0]
tweets.pop(0)
Academic_test_tweets=tweets[:]
Tweets=[]
for (words, sentiment) in tweets:
    bigram=[]
    bigram_list=[]
    words_filtered = [e.lower() for e in WordPunctTokenizer().tokenize(words) if len(e) >= 3]
    words_filtered=[re.sub(r'(.)\1+', r'\1\1', e) for e in words_filtered if len(e)>=3]
    bigram_words=bigrams(words_filtered)
    for x in bigram_words:
        bigram.append(x)
    for (bi) in bigram:
        bigram_word=bi[0]+bi[1]
        bigram_list.append(bigram_word)
    list_to_append=words_filtered+bigram_list
    Tweets.append((list_to_append, sentiment))
Academic_test_tweets_words=Tweets[:]


word_features = get_word_features(get_words_in_tweets(Academic_test_tweets_words))
Academic_test_set = nltk.classify.apply_features(extract_features,Academic_test_tweets_words)





tweets=[]
readdata=csv.reader(open('C:\Users\Troy\Documents\Data\Gold_train.csv','r'))
for row in readdata:
    tweets.append(row)
Header = tweets[0]
tweets.pop(0)
AcademicTweets=tweets[:]
Tweets=[]
for (words, sentiment) in tweets:
    bigram=[]
    bigram_list=[]
    words_filtered = [e.lower() for e in WordPunctTokenizer().tokenize(words) if len(e) >= 3]
    words_filtered=[re.sub(r'(.)\1+', r'\1\1', e) for e in words_filtered if len(e)>=3]
    bigram_words=bigrams(words_filtered)
    for x in bigram_words:
        bigram.append(x)
    for (bi) in bigram:
        bigram_word=bi[0]+bi[1]
        bigram_list.append(bigram_word)
    list_to_append=words_filtered+bigram_list
    Tweets.append((list_to_append, sentiment))
AcademicWords=Tweets[:]

word_features = get_word_features(get_words_in_tweets(AcademicWords))
Academic_training_set = nltk.classify.apply_features(extract_features,AcademicWords)
Academic_classifier = nltk.NaiveBayesClassifier.train(Academic_training_set)
#Negative_classifier.show_most_informative_features(1)
print(nltk.classify.accuracy(Academic_classifier, Academic_test_set), 'tweet corpus used in academic paper Sentiment Analysis on the Social Networks Using Stream Algorithms Authors: Nathan Aston, Timothy Munson, Jacob Liddle, Garrett Hartshaw, Dane Livingston, Wei Hu   *compare to their accuracy of 87.5%')


pickle.dump(Academic_classifier, open('Academic_classifier.pickle','wb'))
4

0 に答える 0