お時間をいただき申し訳ありませんが、私は本当にブロックされています!
私は Python がまったく得意ではありませんが、学習に一生懸命取り組んでおり、このスクリプトを実行しようとしています。スレッドがなくても機能しますが、Python のスキルを習得して向上させるために、これの何が問題なのかを理解したいと思いました。
問題: - スクリプトが終わらない - 何も解析していない ... urlopen が正しく動作していないようだ
助けてくれてありがとう、私はまだ試しています:-)
import Queue
import threading
import urllib2
from urllib2 import urlopen
import time
from bs4 import BeautifulSoup as BeautifulSoup
import xlwt
import time
import socket
socket.setdefaulttimeout(20.0)
class Retry(object):
default_exceptions = (Exception,)
def __init__(self, tries, exceptions=None, delay=0):
"""
Decorator for retrying a function if exception occurs
tries -- num tries
exceptions -- exceptions to catch
delay -- wait between retries
"""
self.tries = tries
if exceptions is None:
exceptions = Retry.default_exceptions
self.exceptions = exceptions
self.delay = delay
def __call__(self, f):
def fn(*args, **kwargs):
exception = None
for _ in range(self.tries):
try:
return f(*args, **kwargs)
except self.exceptions, e:
print "Retry, exception: "+str(e)
time.sleep(self.delay)
exception = e
#if no success after tries, raise last exception
raise exception
return fn
@Retry(5)
def open_url(source):
print("OPENING %s" % source)
print("Retrying to open and read the page")
resp = urlopen(source)
resp = resp.read()
return resp
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and then grabs chunk of webpage
chunk = open_url(host)
#chunk = url.read()
#place chunk into out queue
self.out_queue.put(chunk)
#signals to queue job is done
self.queue.task_done()
class DatamineThread(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, out_queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
def run(self):
global x
while True:
#grabs host from queue
chunk = self.out_queue.get()
#parse the chunk
soup = BeautifulSoup(chunk)
#print soup
tableau = soup.findAll('table')
rows = tableau[1].findAll('tr')
print("DONE")
for tr in rows:
cols = tr.findAll('td')
y = 0
x = x + 1
for td in cols:
texte_bu = td.text
texte_bu = texte_bu.encode('utf-8')
print texte_bu
ws.write(x,y,td.text)
y = y + 1
wb.save("IA.xls")
#signals to queue job is done
self.out_queue.task_done()
break
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(13):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
for i in range(1):
dt = DatamineThread(out_queue)
dt.setDaemon(True)
dt.start()
#wait on the queue until everything has been processed
queue.join()
out_queue.join()
global x
x = 0
wb = xlwt.Workbook(encoding='utf-8')
ws = wb.add_sheet("BULATS_IA_PARSED")
Countries_List = ['Afghanistan','Armenia','Brazil','Argentina','Armenia','Australia','Austria','Azerbaijan','Bahrain','Bangladesh','Belgium','Belize','Bolivia','Bosnia and Herzegovina','Brazil','Brunei Darussalam','Bulgaria','Cameroon','Canada','Central African Republic','Chile','China','Colombia','Costa Rica','Croatia','Cuba','Cyprus','Czech Republic','Denmark','Dominican Republic','Ecuador','Egypt','Eritrea','Estonia','Ethiopia','Faroe Islands','Fiji','Finland','France','French Polynesia','Georgia','Germany','Gibraltar','Greece','Grenada','Hong Kong','Hungary','Iceland','India','Indonesia','Iran','Iraq','Ireland','Israel','Italy','Jamaica','Japan','Jordan','Kazakhstan','Kenya','Kuwait','Latvia','Lebanon','Libya','Liechtenstein','Lithuania','Luxembourg','Macau','Macedonia','Malaysia','Maldives','Malta','Mexico','Monaco','Montenegro','Morocco','Mozambique','Myanmar (Burma)','Nepal','Netherlands','New Caledonia','New Zealand','Nigeria','Norway','Oman','Pakistan','Palestine','Papua New Guinea','Paraguay','Peru','Philippines','Poland','Portugal','Qatar','Romania','Russia','Saudi Arabia','Serbia','Singapore','Slovakia','Slovenia','South Africa','South Korea','Spain','Sri Lanka','Sweden','Switzerland','Syria','Taiwan','Thailand','Trinadad and Tobago','Tunisia','Turkey','Ukraine','United Arab Emirates','United Kingdom','United States','Uruguay','Uzbekistan','Venezuela','Vietnam']
hosts = ["http://www.cambridgeesol.org/institutions/results.php?region=%s&type=&BULATS=on" % Countries for Countries in Countries_List]
main()
print "Elapsed Time: %s" % (time.time() - start)
PS: また、urllib3 (keep-connexion) がこの場合に役立つと思いますか? また、誰がこれを実装するのか説明してもらえますか?