multithreading - Python マルチスレッドの欠落データ

Question

use私は、URLが機能しているかどうかを確認するためのPythonスクリプトに取り組んでいます。スクリプトは、URL と応答コードをログファイルに書き込みます。チェックを高速化するために、スレッド化とキューを使用しています。

このスクリプトは、チェックする URL の数が少ない場合はうまく機能しますが、URL の数を数百に増やすと、いくつかの URL がログファイルから失われます。

修正する必要があるものはありますか?

私のスクリプトは

#!/usr/bin/env python
import Queue
import threading
import urllib2,urllib,sys,cx_Oracle,os
import time
from urllib2 import HTTPError, URLError


queue = Queue.Queue()
##print_queue = Queue.Queue()

class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl
    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

class ThreadUrl(threading.Thread):
    #Threaded Url Grab
##    def __init__(self, queue, print_queue):
    def __init__(self, queue,error_log):    
        threading.Thread.__init__(self)
        self.queue = queue
##        self.print_queue = print_queue
        self.error_log = error_log

    def do_something_with_exception(self,idx,url,error_log):
        exc_type, exc_value = sys.exc_info()[:2]
##        self.print_queue.put([idx,url,exc_type.__name__])
        with open( error_log, 'a') as err_log_f:
            err_log_f.write("{0},{1},{2}\n".format(idx,url,exc_type.__name__))


    def openUrl(self,pair):
        try:
            idx = pair[1]
            url = 'http://'+pair[2]

            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1')


            #open urls of hosts 
            resp = urllib2.urlopen(request, timeout=10)

##            self.print_queue.put([idx,url,resp.code])
            with open( self.error_log, 'a') as err_log_f:
                err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))


        except:
            self.do_something_with_exception(idx,url,self.error_log)    


    def run(self):
        while True:
            #grabs host from queue
            pair = self.queue.get()
            self.openUrl(pair)

            #signals to queue job is done
            self.queue.task_done()

def readUrlFromDB(queue,connect_string,column_name,table_name):
    try:  
        connection = cx_Oracle.Connection(connect_string)
        cursor = cx_Oracle.Cursor(connection)
        query = 'select ' + column_name + ' from ' + table_name
        cursor.execute(query)

        #Count lines in the file
        rows = cursor.fetchall()
        total = cursor.rowcount        

        #Loop through returned urls
        for row in rows:
            #print row[1],row[2]
##            url = 'http://'+row[2]
            queue.put(row)
        cursor.close()
        connection.close()

        return total

    except cx_Oracle.DatabaseError, e:
        print e[0].context
        raise   

def main():   
    start = time.time()
    error_log = "D:\\chkWebsite_Error_Log.txt"

    #Check if error_log file exists
    #If exists then deletes it
    if os.path.isfile(error_log):  
         os.remove(error_log)

    #spawn a pool of threads, and pass them queue instance 
    for i in range(10):
        t = ThreadUrl(queue,error_log)
        t.setDaemon(True)
        t.start()

    connect_string,column_name,table_name = "user/pass@db","*","T_URL_TEST"
    tn = readUrlFromDB(queue,connect_string,column_name,table_name)


   #wait on the queue until everything has been processed     
    queue.join()
##    print_queue.join()

    print "Total retrived: {0}".format(tn)
    print "Elapsed Time: %s" % (time.time() - start)

main()

score 1 · Accepted Answer

Pythonのスレッドモジュールは、グローバルインタープリターロック（http://wiki.python.org/moin/GlobalInterpreterLock ）のため、実際にはマルチスレッドではありません。そのため、実際にはmultiprocessing http://docs.python.org/library/multiprocessing.htmlを使用する必要があります。あなたは本当に複数のコアを利用したいと思っています。

また、同時にファイルにアクセスしているようです

with open( self.error_log, 'a') as err_log_f:
    err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))

これは本当に悪いAFAIKです。2つのスレッドが同じファイルに同時にまたはほぼ同時に書き込もうとしている場合、それらは実際にはマルチスレッドではないことに注意してください。動作は定義されていない傾向があります。閉じたばかり...

とにかく、ファイルへの書き込みを処理するために3番目のキューが必要になります。

score 0 · Accepted Answer

多くのスレッドが同時にログファイルに書き込もうとしているため、一見すると競合状態のように見えます。書き込み用にファイルをロックする方法については、この質問を参照してください(一度に 1 つのスレッドしかアクセスできません)。

multithreading - Python マルチスレッドの欠落データ

2 に答える 2

Related

Reference