インポートされた CSV ファイル (~500MB) から 4 つの列を抽出して、scikit-learn
回帰モデルのフィッティングに使用しています。
抽出を行うために使用されるこの関数は非常に遅いようです。今日Pythonを学んだばかりですが、関数を高速化する方法について何か提案はありますか?
マルチスレッド/コアは使用できますか? 私のシステムには 4 つのコアがあります。
def splitData(jobs):
salaries = [jobs[i]['salaryNormalized'] for i, v in enumerate(jobs)]
descriptions = [jobs[i]['description'] + jobs[i]['normalizedLocation'] + jobs[i]['category'] for i, v in enumerate(jobs)]
titles = [jobs[i]['title'] for i, v in enumerate(jobs)]
return salaries, descriptions, titles
印刷タイプ(ジョブ)
<type 'list'>
印刷ジョブ[:1]
[{'category': 'Engineering Jobs', 'salaryRaw': '20000 - 30000/annum 20-30K', 'rawLocation': 'Dorking, Surrey, Surrey', 'description': 'Engineering Systems Analyst Dorking Surrey Salary ****K Our client is located in Dorking, Surrey and are looking for Engineering Systems Analyst our client provides specialist software development Keywords Mathematical Modelling, Risk Analysis, System Modelling, Optimisation, MISER, PIONEEER Engineering Systems Analyst Dorking Surrey Salary ****K', 'title': 'Engineering Systems Analyst', 'sourceName': 'cv-library.co.uk', 'company': 'Gregory Martin International', 'contractTime': 'permanent', 'normalizedLocation': 'Dorking', 'contractType': '', 'id': '12612628', 'salaryNormalized': '25000'}]
def loadData(filePath):
reader = csv.reader( open(filePath) )
rows = []
for i, row in enumerate(reader):
categories = ["id", "title", "description", "rawLocation", "normalizedLocation",
"contractType", "contractTime", "company", "category",
"salaryRaw", "salaryNormalized","sourceName"]
# Skip header row
if i != 0:
rows.append( dict(zip(categories, row)) )
return rows
def splitData(jobs):
salaries = []
descriptions = []
titles = []
for i in xrange(len(jobs)):
salaries.append( jobs[i]['salaryNormalized'] )
descriptions.append( jobs[i]['description'] + jobs[i]['normalizedLocation'] + jobs[i]['category'] )
titles.append( jobs[i]['title'] )
return salaries, descriptions, titles
def fit(salaries, descriptions, titles):
#Vectorize
vect = TfidfVectorizer()
vect2 = TfidfVectorizer()
descriptions = vect.fit_transform(descriptions)
titles = vect2.fit_transform(titles)
#Fit
X = hstack((descriptions, titles))
y = [ np.log(float(salaries[i])) for i, v in enumerate(salaries) ]
rr = Ridge(alpha=0.035)
rr.fit(X, y)
return vect, vect2, rr, X, y
jobs = loadData( paths['train_data_path'] )
salaries, descriptions, titles = splitData(jobs)
vect, vect2, rr, X_train, y_train = fit(salaries, descriptions, titles)