use私は、URLが機能しているかどうかを確認するためのPythonスクリプトに取り組んでいます。スクリプトは、URL と応答コードをログ ファイルに書き込みます。チェックを高速化するために、スレッド化とキューを使用しています。
このスクリプトは、チェックする URL の数が少ない場合はうまく機能しますが、URL の数を数百に増やすと、いくつかの URL がログ ファイルから失われます。 修正する必要があるものはありますか?私のスクリプトは
#!/usr/bin/env python
import Queue
import threading
import urllib2,urllib,sys,cx_Oracle,os
import time
from urllib2 import HTTPError, URLError
queue = Queue.Queue()
##print_queue = Queue.Queue()
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
def http_error_302(self, req, fp, code, msg, headers):
infourl = urllib.addinfourl(fp, headers, req.get_full_url())
infourl.status = code
infourl.code = code
return infourl
http_error_300 = http_error_302
http_error_301 = http_error_302
http_error_303 = http_error_302
http_error_307 = http_error_302
class ThreadUrl(threading.Thread):
#Threaded Url Grab
## def __init__(self, queue, print_queue):
def __init__(self, queue,error_log):
threading.Thread.__init__(self)
self.queue = queue
## self.print_queue = print_queue
self.error_log = error_log
def do_something_with_exception(self,idx,url,error_log):
exc_type, exc_value = sys.exc_info()[:2]
## self.print_queue.put([idx,url,exc_type.__name__])
with open( error_log, 'a') as err_log_f:
err_log_f.write("{0},{1},{2}\n".format(idx,url,exc_type.__name__))
def openUrl(self,pair):
try:
idx = pair[1]
url = 'http://'+pair[2]
opener = urllib2.build_opener(NoRedirectHandler())
urllib2.install_opener(opener)
request = urllib2.Request(url)
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1')
#open urls of hosts
resp = urllib2.urlopen(request, timeout=10)
## self.print_queue.put([idx,url,resp.code])
with open( self.error_log, 'a') as err_log_f:
err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))
except:
self.do_something_with_exception(idx,url,self.error_log)
def run(self):
while True:
#grabs host from queue
pair = self.queue.get()
self.openUrl(pair)
#signals to queue job is done
self.queue.task_done()
def readUrlFromDB(queue,connect_string,column_name,table_name):
try:
connection = cx_Oracle.Connection(connect_string)
cursor = cx_Oracle.Cursor(connection)
query = 'select ' + column_name + ' from ' + table_name
cursor.execute(query)
#Count lines in the file
rows = cursor.fetchall()
total = cursor.rowcount
#Loop through returned urls
for row in rows:
#print row[1],row[2]
## url = 'http://'+row[2]
queue.put(row)
cursor.close()
connection.close()
return total
except cx_Oracle.DatabaseError, e:
print e[0].context
raise
def main():
start = time.time()
error_log = "D:\\chkWebsite_Error_Log.txt"
#Check if error_log file exists
#If exists then deletes it
if os.path.isfile(error_log):
os.remove(error_log)
#spawn a pool of threads, and pass them queue instance
for i in range(10):
t = ThreadUrl(queue,error_log)
t.setDaemon(True)
t.start()
connect_string,column_name,table_name = "user/pass@db","*","T_URL_TEST"
tn = readUrlFromDB(queue,connect_string,column_name,table_name)
#wait on the queue until everything has been processed
queue.join()
## print_queue.join()
print "Total retrived: {0}".format(tn)
print "Elapsed Time: %s" % (time.time() - start)
main()