I have a program that predicts whether a news article is about a particular topic.
There are two main scripts:
1) bow_train.py - generates a word list and a model and saves them to two files (arab.model and wordList.pkl).
2) bow_predict.py - uses the word list and the model to classify unseen articles.
The method used is logistic regression rather than a support vector machine, since logistic regression is believed to perform much better on this kind of classification.
I would like to improve the results. Is there some other way to give extra emphasis to specific keywords? For example, for the topic "Arab Spring", I would like to pass in a list of keywords such as ["protest", "unrest", "revolution"], so that files containing these keywords receive a higher probability than files that do not.
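To make it concrete, here is a rough sketch of the kind of boosting I have in mind (boost_keywords, the keyword list, and the factor 5.0 are invented for illustration; none of this is in the scripts below):

# hypothetical: inflate the counts of chosen keyword features before
# training/prediction so that they weigh more in the bag-of-words vector
def boost_keywords(features, wordList, keywords, factor=5.0):
    index = {word: i for i, word in enumerate(wordList)}
    boosted = list(features)
    for kw in keywords:
        if kw in index:
            boosted[index[kw]] *= factor
    return boosted

# e.g.: denseData = [boost_keywords(f, wordList,
#                                   ['protest', 'unrest', 'revolution'])
#                    for f in denseData]

I am not sure whether manipulating the raw counts like this is sound, or whether there is a cleaner way to weight individual features in liblinear.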
bow_predict.py
import os
import pickle
import operator
from liblinearutil import *
from bow_util import *
# path to directory with articles that should be classified
rootdirAll = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\all\\'
# load the wordList and model from the training phase
m = load_model('arab.model')
with open('wordList.pkl', 'rb') as wordListIn:
    wordList = pickle.load(wordListIn)
counterByFilepathAll = {}
# count and store term frequencies
for folder, subs, files in os.walk(rootdirAll):
    for filename in files:
        filepath = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathAll[filepath] = count_words(wordsInArticle)
denseData = []
# generate features from term frequencies (bag-of-words); the dict's
# iteration order is reused below to map probabilities back to files
for _, counter in counterByFilepathAll.items():
    denseData.append(gen_features(counter, wordList))
# liblinear/libsvm require a label for every instance, even for unknown
# data, so assign the dummy class 1 to all articles
classList = [1 for _ in range(len(counterByFilepathAll))]
# predict using the model from the training phase; '-b 1' requests
# probability estimates (without it, p_val holds raw decision values)
y, x = classList, denseData
p_label, p_acc, p_val = predict(y, x, m, '-b 1')
# store the probability of the positive class (label 1) by filepath
posIdx = m.get_labels().index(1)
probByFilepath = {}
for i, filepath in enumerate(counterByFilepathAll):
    probByFilepath[filepath] = p_val[i][posIdx]
# sort by probability, highest first
sortedByProb = sorted(probByFilepath.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
# write the ranked results to an output file
with open('probsOut.txt', 'w') as probsOut:
    for filepath, prob in sortedByProb:
        probsOut.write('%s %s\n' % (filepath, prob))
bow_train.py
import os
import sys
import pickle
from liblinearutil import *
from bow_util import *
# Initialize directories for articles
rootdirArab = sys.argv[1]
rootdirNoArab = sys.argv[2]
#rootdirArab = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\arab\\'
#rootdirNoArab = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\no_arab\\'
wordSet = set()
counterByFilepathArab = {}
counterByFilepathNoArab = {}
# generate set of all words in all articles
for rootdir in [rootdirArab, rootdirNoArab]:
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            filepath = os.path.join(folder, filename)
            wordsInArticle = get_words_from_file(filepath)
            wordSet.update(wordsInArticle)
# store sorted set in list
wordList = sorted(wordSet)
# save sorted list to an output file for the prediction phase
with open('wordList.pkl', 'wb') as wordListOut:
    pickle.dump(wordList, wordListOut)
# count and store term frequencies for all arab spring training articles
for folder, subs, files in os.walk(rootdirArab):
    for filename in files:
        filepath = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathArab[filepath] = count_words(wordsInArticle)
# count and store term frequencies for all non arab spring training articles
for folder, subs, files in os.walk(rootdirNoArab):
    for filename in files:
        filepath = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathNoArab[filepath] = count_words(wordsInArticle)
# generate features. the features for one article are a list of the
# frequencies of each term in wordList found in the article
denseData = []
for counter in counterByFilepathArab.values():
    denseData.append(gen_features(counter, wordList))
for counter in counterByFilepathNoArab.values():
    denseData.append(gen_features(counter, wordList))
# set output value to 1 for arab spring articles and -1 for non arab spring articles
classList = [1 for _ in range(len(counterByFilepathArab))] + \
            [-1 for _ in range(len(counterByFilepathNoArab))]
# train the logistic regression model; liblinear's default solver is a
# linear SVM, so L2-regularized logistic regression ('-s 0') must be
# requested explicitly (it is also needed for '-b 1' at prediction time)
y, x = classList, denseData
prob = problem(y, x)
param = parameter('-s 0')
# append '-v 5' to the parameter string to obtain cross validation results
m = train(prob, param)
# store model in output file for prediction phase
save_model('arab.model', m)
# uncomment to check if training worked as expected
#p_label, p_acc, p_val = predict(y, x, m)
#ACC, MSE, SCC = evaluations(y, p_label)
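For reference, bow_util.py is not shown above. A minimal sketch of what I assume its three helpers look like (the tokenization details are guesses; the real implementation may differ):

# bow_util.py - hypothetical sketch of the helpers used by both scripts
import re
from collections import Counter

def get_words_from_file(filepath):
    # read one article and split it into lowercase word tokens
    with open(filepath, 'r', errors='ignore') as f:
        return re.findall(r'[a-z]+', f.read().lower())

def count_words(words):
    # term frequency of each word in one article
    return Counter(words)

def gen_features(counter, wordList):
    # dense feature vector: the frequency of every wordList entry
    return [counter.get(word, 0) for word in wordList]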