python - 単純ベイズ分類器エラー

Question

こんにちは。単純ベイズ分類器（Naive Bayes classifier）を使ってテキストを分類しようとしています。ライブラリは NLTK を使用しています。classify() メソッドで分類器をテストすると、最初の項目については常に正しい分類が返されますが、分類対象のテキストの他のすべての行に対しても同じ分類が返されてしまいます。以下が私のコードです：

from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
import nltk
import random
import nltk.data

# Build one (word list, category) pair per file in the movie_reviews corpus,
# then shuffle the pairs in place.
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Count the lower-cased corpus words and keep the first 2000 keys as the
# feature vocabulary.
# NOTE(review): slicing keys() presumably relies on an NLTK version whose
# FreqDist.keys() returns a list ordered by frequency -- confirm.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000] 

def bag_of_words(words):
    """Return a presence-only feature dict mapping each item of ``words`` to True."""
    return {token: True for token in words}

def document_features(document): 
    """Return a dict flagging which ``word_features`` words occur in ``document``.

    ``word_features`` is read from module scope. Each key has the form
    'contains(<word>)' and each value is True or False.
    """
    # Convert once to a set so each membership test below runs against the set.
    document_words = set(document) 
    features = {}
    for word in word_features:
        # The key is the wrapped string 'contains(<word>)', not the bare word.
        features['contains(%s)' % word] = (word in document_words)
    return features

# Turn every (document, category) pair into a (feature dict, category) pair.
featuresets = [(document_features(d), c) for (d,c) in documents]
# Hold out the first 100 shuffled items as test_set and train on the rest;
# test_set is not used again in this snippet.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

text1="i love this city"
text2="i hate this city"


# bag_of_words() keys these dicts by the bare tokens, whereas the training
# features built by document_features() are keyed by 'contains(<word>)'.
feats1=bag_of_words(word_tokenize(text1))
feats2=bag_of_words(word_tokenize(text2))


print classifier.classify(feats1)
print classifier.classify(feats2)

このコードは pos を 2 回出力しますが、コードの最後の 2 行を入れ替えると、今度は neg を 2 回出力します。どなたか助けていただけませんか？

score 4 · Accepted Answer

次の行を変更してください：

features['contains(%s)' % word] = (word in document_words)

これを次のように変更します：

features[word] = (word in document)

そうしないと、分類器は「contains(...)」という形式の「単語」しか知らないため、"i love this city" に含まれる単語については何も分かりません。

import nltk.tokenize as tokenize
import nltk
import random
# Seed the RNG so that the random.shuffle() call below is repeatable.
random.seed(3)

def bag_of_words(words):
    """Build a feature dict in which every item of ``words`` maps to True."""
    return dict.fromkeys(words, True)

def document_features(document): 
    """Return ``{word: bool}`` for every word in the module-level ``word_features``.

    Each value records whether that word is in ``document``.
    """
    features = {}
    for word in word_features:
        # Key by the bare word, matching the keys produced by bag_of_words().
        features[word] = (word in document)
        # Previous form, kept for comparison (wrapped the key as 'contains(<word>)'):
        # features['contains(%s)' % word] = (word in document_words)
    return features

# Short alias for the corpus reader used throughout the script.
movie_reviews = nltk.corpus.movie_reviews

# One (set of words, category) pair per corpus file; each document is a set,
# which is what document_features() runs its `word in document` test against.
documents = [(set(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Count the lower-cased corpus words and keep the first 2000 keys as the
# feature vocabulary.
# NOTE(review): slicing keys() presumably relies on an NLTK version whose
# FreqDist.keys() returns a list ordered by frequency -- confirm.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000] 

# Train on the first 200 shuffled documents only.
train_set = [(document_features(d), c) for (d, c) in documents[:200]]

classifier = nltk.NaiveBayesClassifier.train(train_set)

classifier.show_most_informative_features()
# Sanity check: print the 'pos' probability for each key word taken alone.
for word in ('love', 'hate'):
    # No hope in passing the tests if word is not in word_features
    assert word in word_features
    print('probability {w!r} is positive: {p:.2%}'.format(
        w = word, p = classifier.prob_classify({word : True}).prob('pos')))

tests = ["i love this city",
         "i hate this city"]

# Classify each sentence from its bare-token feature dict and print the label.
for test in tests:
    words = tokenize.word_tokenize(test)
    feats = bag_of_words(words)
    print('{s} => {c}'.format(s = test, c = classifier.classify(feats)))

これを実行すると、次の出力が得られます：

Most Informative Features
                   worst = True              neg : pos    =     15.5 : 1.0
              ridiculous = True              neg : pos    =     11.5 : 1.0
                  batman = True              neg : pos    =      7.6 : 1.0
                   drive = True              neg : pos    =      7.6 : 1.0
                   blame = True              neg : pos    =      7.6 : 1.0
                terrible = True              neg : pos    =      6.9 : 1.0
                  rarely = True              pos : neg    =      6.4 : 1.0
                 cliches = True              neg : pos    =      6.0 : 1.0
                       $ = True              pos : neg    =      5.9 : 1.0
               perfectly = True              pos : neg    =      5.5 : 1.0
probability 'love' is positive: 61.52%
probability 'hate' is positive: 36.71%
i love this city => pos
i hate this city => neg

python - 単純ベイズ分類器エラー

回答 1 件

Related

Reference