ロシア語のテキストを分類する必要があるテキスト分類の問題に直面しました。特徴抽出のために、私は scikit Learn TfidfTransformer と CountVectorizer を使用しますが、コードをコンパイルした後に間違いがあります。
'UnicodeDecodeError: 'utf8' codec can't decode byte 0xc2 in position 0:
invalid continuation byte'.
この間違いを修正するにはどうすればよいですか? Python のコードは次のとおりです。
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA
import os
import nltk
import re
import sys
from nltk import NaiveBayesClassifier
import nltk.classify
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import re
data_path = os.path.abspath(os.path.join('/home/lena/','corpus'))
official_path = os.path.join(data_path,'official')
#print official_path
official2_path = os.path.join(data_path,'official_2')
talk_path = os.path.join(data_path,'talk')
talk2_path = os.path.join(data_path,'talk_2')
#fiction_path = os.path.join(data_path,'fiction')
#fiction2_path = os.path.join(data_path,'fiction_2')
def get_text(path):
with open(path,'rU') as file:
line = file.readlines()
return ''.join(line)
def get_textdir(path):
filelist = os.listdir(path)
all_text = [get_text(os.path.join(path,f)) for f in filelist]
return all_text
all_talk = get_textdir(talk_path)
all_official = get_textdir(official_path)
official_2 = get_textdir(official2_path)
talk_2 = get_textdir(talk2_path)
train_set = all_talk
test_set = talk_2
stopWords = stopwords.words('russian')
vectorizer = CountVectorizer(stop_words = stopWords)
print vectorizer
train = vectorizer.fit_transform(train_set).toarray()
test = vectorizer.transform(test_set).toarray()
print 'train set', train
print 'test set', test
transformer.fit(train)
print transformer.transform(train).toarray()
transformer.fit(test)
tfidf = transformer.transform(test)
print tfidf.todense()