I wrote a small Python application and have now rewritten it to the point where my current Python skills are not enough to go any further. I started with a single-threaded version that used Beautiful Soup as the parser and switched the parser to lxml, then made the script multi-threaded. I discovered Twisted, but could not manage to convert this small snippet to it. I am posting the code here in the hope that someone can point me in a better direction to make it a bit faster. Fetching 150,000 pages currently takes about an hour; I am happy with that, since my first attempt was three times slower.
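In short: a SELECT pulls every unchecked food URL out of MySQL, six threads download the pages through one shared urllib3 connection pool, and six more threads parse the HTML with lxml and INSERT the nutrition rows. Boiled down, it is the usual two-queue producer/consumer pattern; here is a stripped sketch of just that plumbing (`work` and `handle` are made-up stand-ins for the HTTP GET and the parse+INSERT), in case it makes the real script easier to review:

import Queue, threading

def pipeline(items, work, handle, n=6):
    q, out = Queue.Queue(), Queue.Queue()

    def fetcher():
        while True:
            out.put(work(q.get()))   # stage 1: download
            q.task_done()

    def miner():
        while True:
            handle(out.get())        # stage 2: parse + INSERT
            out.task_done()

    for target in [fetcher] * n + [miner] * n:
        th = threading.Thread(target=target)
        th.setDaemon(True)
        th.start()
    for item in items:
        q.put(item)
    q.join()
    out.join()

The actual script: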
#! /usr/bin/python
# coding: ISO-8859-1
import time, PySQLPool, Queue, threading
from urllib3 import connection_from_url
from lxml import etree
import cStringIO as StringIO
# Plain desktop-browser headers; gzip/deflate keeps the transfers small.
headers = {
    'User-Agent': 'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.7'
}

t = time.time()

PySQLPool.getNewPool().maxActiveConnections = 60
db = PySQLPool.getNewConnection(username='user', password='pass', host='127.0.0.1', db='fddb')

# One keep-alive pool for the whole site, shared by all fetch threads.
pool = connection_from_url('http://fddb.info/', maxsize=60, timeout=150, headers=headers)

detailCounter = 0
counter_lock = threading.Lock()  # detailCounter is incremented from several threads

queue = Queue.Queue()      # URLs waiting to be fetched
out_queue = Queue.Queue()  # fetched HTML waiting to be parsed
# Map the German nutrient labels on the page to the column names in `details`.
clean_rows = {
    "Brennwert": "details_brennwert",
    "Kalorien": "details_kalorien",
    "Protein": "details_protein",
    "Kohlenhydrate": "details_kohlenhydrate",
    "davon Zucker": "details_zucker",
    "davon Polyole": "details_polyole",
    "Fett": "details_fett",
    "Ballaststoffe": "details_ballaststoffe",
    "Broteinheiten": "details_broteinheit",
    "Alkohol": "details_alkohol",
    "Cholesterin": "details_cholesterin",
    "Koffein": "details_koffein",
    "Wassergehalt": "details_wasser",
    "Vitamin C": "details_vitc",
    "Vitamin A": "details_vita",
    "Vitamin D": "details_vitd",
    "Vitamin E": "details_vite",
    "Vitamin B1": "details_vitb1",
    "Vitamin B2": "details_vitb2",
    "Vitamin B6": "details_vitb6",
    "Vitamin B12": "details_vitb12",
    "Natrium": "details_natrium",
    "Eisen": "details_eisen",
    "Zink": "details_zink",
    "Magnesium": "details_magnesium",
    "Chlor": "details_chlor",
    "Mangan": "details_mangan",
    "Schwefel": "details_schwefel",
    "Kalium": "details_kalium",
    "Kalzium": "details_kalzium",
    "Phosphor": "details_phosphor",
    "Kupfer": "details_kupfer",
    "Fluor": "details_fluor"
}
def rows_escape(text):
    # Replace a known row label with its column name.
    for item, key in clean_rows.items():
        text = text.replace(item, key)
    text = text.rstrip()
    return text

# Strip units and percent signs and turn the decimal comma into a point.
# The micro sign is removed here; the micro-to-milli conversion itself
# happens in DatamineThread.
clean_values = {
    "kJ": "",
    "kcal": "",
    "g": "",
    "mg": "",
    "%": "",
    ",": ".",
    u"\u03bc": ""
}

def values_escape(text):
    for item, key in clean_values.items():
        text = text.replace(item, key)
    text = text.rstrip()
    return text
def insertDetails(container, foods_id):
    c = PySQLPool.getNewQuery(db)
    # Build the column and value lists in one pass; the column names come
    # from clean_rows and the values were normalized by values_escape.
    query_rows = ''
    query_values = ''
    for item in container:
        query_rows += item['row'] + ','
        query_values += item['value'] + ','
    c.Query("INSERT INTO details (%sdetails_id,foods_id) VALUES (%sNULL,%s)" % (query_rows, query_values, foods_id))
    c.Query("UPDATE foods SET foods_check = '1' WHERE foods_id=%d" % foods_id)

def getHP(url):
    # Fetch one page through the shared keep-alive pool.
    r = pool.request('GET', '/' + url)
    return r.data
# Fetch stage: takes [url, foods_id] off `queue` and puts [html, foods_id]
# onto `out_queue`.
class ThreadUrl(threading.Thread):
    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        while True:
            host = self.queue.get()
            data = getHP(host[0])
            self.out_queue.put([data, host[1]])
            self.queue.task_done()
# Parse stage: takes [html, foods_id] off `out_queue`, extracts the
# nutrition rows with lxml and writes them to the database.
class DatamineThread(threading.Thread):
    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        global detailCounter
        # One parser per thread; lxml parsers are not thread-safe, but a
        # thread can reuse its own across documents.
        parser = etree.HTMLParser(encoding='cp1252')
        while True:
            data, foods_id = self.out_queue.get()
            container = []
            tree = etree.parse(StringIO.StringIO(data), parser)
            # The nutrition rows are the only divs with exactly these two
            # inline styles.
            divx = tree.xpath('//div[@style="background-color:#f0f5f9;padding:2px 4px;" or @style="padding:2px 4px;"]')
            for xdiv in divx:
                x = etree.ElementTree(element=xdiv, parser=parser)
                value = x.xpath('string(//div/text())')
                label = rows_escape(x.xpath('string(//*[self::a or self::span]/text())'))
                if "[nodata]" not in value:
                    if u"\u03bc" in value:
                        # Value is given in micrograms; store it in milligrams.
                        value = str(float(values_escape(value)) / 1000)
                        container.append({'row': label, 'value': value})
                    else:
                        container.append({'row': label, 'value': values_escape(value)})
                    with counter_lock:  # += is not atomic across threads
                        detailCounter += 1
            insertDetails(tuple(container), foods_id)
            self.out_queue.task_done()
def main():
    c = PySQLPool.getNewQuery(db)
    c.Query("SELECT foods_id, foods_url FROM foods WHERE foods_check = 0")
    urls = c.record

    # Six fetch threads...
    for i in range(6):
        tu = ThreadUrl(queue, out_queue)
        tu.setDaemon(True)
        tu.start()

    for item in urls:
        queue.put([item['foods_url'], item['foods_id']])

    # ...and six parse/insert threads.
    for i in range(6):
        dt = DatamineThread(out_queue)
        dt.setDaemon(True)
        dt.start()

    # Block until every page has been fetched and parsed.
    queue.join()
    out_queue.join()

main()
db.close()
print "Time: %.2f New details: %d" % (time.time() - t, detailCounter)