URL のリストから title と meta_description を抽出するタスクを与えられ、Goose ライブラリを使って実装しました。以下のコードは正しく書けているでしょうか?
import os
import urlparse

import numpy as np
import pandas
from goose import Goose
# Load the activity log and select the rows to process.
#
# NOTE: os.chdir changes the working directory of the whole process; the
# relative CSV path below is resolved against it.
# The path is a raw string so its backslashes can never be read as escape
# sequences ("\U" is a syntax error in unicode / Python 3 string literals).
os.chdir(r"C:\Users\EDAWES01\Desktop\Cookie profiling")

data = pandas.read_csv('activity_url.csv', delimiter=';')
data_read = np.array(data)

# Column layout as used below: 0 -> user id, 1 -> URL, 2 -> quantity.
quantity = data_read[0:, 2]

# Keep only the rows whose quantity equals 1, then take the first three of
# them. The mask is applied once and shared by both column selections.
selected_rows = data_read[quantity == 1][0:3]
url_data = selected_rows[:, 1]
user_id = selected_rows[:, 0]
def _strip_oref_prefix(url):
    """Return the target URL embedded after the first '=' of the URL path.

    The path component of ``url`` is split on its first '=' only, so a
    target that itself contains '=' is kept whole. When the path holds no
    '=' there is nothing to strip and ``url`` is returned unchanged.
    """
    path = urlparse.urlparse(url).path
    # NOTE(review): only the path component is inspected, so anything after
    # a '?' (the query string) is dropped -- confirm that matches the data.
    _, separator, target = path.partition("=")
    return target if separator else url


# Remove the '~oref=' wrapper from every selected URL.
# The result is a flat (1-D) array with one entry per URL, so len() and
# positional indexing address individual URLs.
clean_url_data = np.array([_strip_oref_prefix(url) for url in url_data])
# Download every cleaned URL once and record its title and meta description.
g = Goose()

# Flatten to a plain list of URLs so that exactly one URL is handed to
# extract() per iteration, whether clean_url_data is 1-D or has an extra
# outer dimension.
urls_to_fetch = np.ravel(clean_url_data).tolist()

titles = []
meta_descriptions = []
for url in urls_to_fetch:
    # A single extract() call per URL; both fields are read from the same
    # article object instead of fetching the page a second time.
    article = g.extract(url)
    titles.append(article.title)
    meta_descriptions.append(article.meta_description)

# store title
# The outer list keeps the original result shape of (1, number_of_urls).
website_title = np.array([titles])

# store meta_description
website_meta_description = np.array([meta_descriptions])