python - Urllib2 でより効率的にスクレイピングするには?

Question

プログラミング初心者です。urllib2 を使って Billboard.com を巡回し、1958 年から 2013 年までの各週のトップソングとそのアーティストをスクレイピングする簡単なスクリプトを書きました。

ボトルネックがどこにあるのか、Urllib2 でより効率的にスクレイピングする方法があるのか、それともより洗練されたツールを使用する必要があるのか疑問に思っています。

import re
import urllib2
# Sequentially walk the Billboard Hot 100 charts, one request per week,
# following each page's "Next" link, and collect [date, song, artist].

# Compile the patterns once instead of on every loop iteration; raw strings
# keep the regex backslashes from being treated as string escapes.
date_pattern = re.compile(r'\d\d\d\d-\d\d-\d\d')
song_pattern = re.compile(r'<h1>.*</h1>')
artist_link_pattern = re.compile(r'/artist.*</a>')
link_text_pattern = re.compile(r'>.*<')
next_link_pattern = re.compile(r'href.*>Next')

array = []
url = 'http://www.billboard.com/charts/1958-08-09/hot-100'
date = ""
# The date is taken from the URL just fetched, so the loop ends right after
# the 2013-07-13 chart has been scraped.
while date != '2013-07-13':
    response = urllib2.urlopen(url)
    try:
        htmlText = response.read()
    finally:
        # Release the connection; otherwise one response object per week
        # stays open for the lifetime of the script.
        response.close()
    date = date_pattern.findall(url)[0]
    # First <h1>...</h1> on the page, with the tags sliced off.
    song = song_pattern.findall(htmlText)[0]
    song = song[4:-5]
    # Second "/artist...</a>" match on the page; keep the text between the
    # first '>' and the last '<' of that match.
    artist = artist_link_pattern.findall(htmlText)[1]
    artist = link_text_pattern.findall(artist)[0]
    artist = artist[1:-1]
    # Link to the following week's chart: drop the first five and the last
    # five characters of the 'href...>Next' match.
    nextWeek = next_link_pattern.findall(htmlText)[0]
    nextWeek = nextWeek[5:-5]
    array.append([date, song, artist])
    url = 'http://www.billboard.com' + nextWeek
print(array)

score 3 · Accepted Answer

あなたのボトルネックはほぼ間違いなく、Web サイトからデータを取得することです。各ネットワークリクエストにはレイテンシがあり、その間に他の処理がブロックされます。一度に複数のリクエストを送信できるように、リクエストを複数のスレッドに分割することを検討する必要があります。基本的に、ここでのパフォーマンスは I/O バウンドであり、CPU バウンドではありません。

これは、クローラーが一般的にどのように機能するかを確認できるように、ゼロから構築された単純なソリューションです。長期的には Scrapy のようなものを使用するのが最善かもしれませんが、シンプルで明示的なものから始めるのが常に役立つことがわかりました。

import threading
import Queue
import time
import datetime
import urllib2
import re

class Crawler(threading.Thread):
    """Worker thread that scrapes chart pages whose URLs arrive on a queue.

    Each worker repeatedly takes a URL from the shared queue, downloads the
    page, extracts the song title and artist, and prints the artist. A page
    that cannot be downloaded or parsed is reported and skipped, so a single
    bad page does not end the worker.

    Args:
        thread_id: Number used to identify this worker in printed output.
        queue: Object providing ``get_nowait()`` that supplies the URLs to
            scrape (a ``Queue.Queue`` in the accompanying script).
        timeout: Seconds to wait on a blocking network operation before the
            download is abandoned.
    """

    # Compiled once and shared by all workers. The capture groups return the
    # text between the delimiters directly, replacing manual slicing.
    # NOTE: scraping with regex is fragile; consider parsing the html with
    # the lxml.html module and using xpath instead.
    _SONG_RE = re.compile(r'<h1>(.*)</h1>')
    _ARTIST_LINK_RE = re.compile(r'/artist.*</a>')
    _LINK_TEXT_RE = re.compile(r'>(.*)<')

    def __init__(self, thread_id, queue, timeout=30):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.queue = queue
        self.fetch_timeout = timeout

        # let's use threading events to tell the thread when to exit
        self.stop_request = threading.Event()

    @classmethod
    def _parse_chart(cls, htmlText):
        """Return ``(song, artist)`` from a chart page, or None if not found.

        The song is the content of the first ``<h1>`` element; the artist is
        the link text of the second ``/artist...</a>`` match on the page.
        """
        song_match = cls._SONG_RE.search(htmlText)
        artist_links = cls._ARTIST_LINK_RE.findall(htmlText)
        if song_match is None or len(artist_links) < 2:
            return None
        artist_match = cls._LINK_TEXT_RE.search(artist_links[1])
        if artist_match is None:
            return None
        return song_match.group(1), artist_match.group(1)

    def _fetch(self, url):
        """Download ``url`` and return the body, always closing the response."""
        response = urllib2.urlopen(url, timeout=self.fetch_timeout)
        try:
            return response.read()
        finally:
            response.close()

    # this is the function which will run when the thread is started
    def run(self):
        """Scrape URLs from the queue until a stop is requested via join()."""
        print('Hello from thread %d! Starting crawling...' % self.thread_id)

        while not self.stop_request.is_set():
            # main crawl loop

            try:
                # attempt to get a url target from the queue
                url = self.queue.get_nowait()
            except Queue.Empty:
                # if there's nothing on the queue, sleep and continue
                time.sleep(0.01)
                continue

            # we got a url, so let's scrape it!
            try:
                htmlText = self._fetch(url)
            except IOError as err:
                # In Python 2, urllib2.URLError and socket timeouts are both
                # IOError subclasses; skip this page rather than letting the
                # exception end the thread.
                print('thread %d failed to fetch %s: %s'
                      % (self.thread_id, url, err))
                continue

            parsed = self._parse_chart(htmlText)
            if parsed is None:
                print('thread %d could not parse %s' % (self.thread_id, url))
                continue
            song, artist = parsed

            # Apply the format with '%' so the values are interpolated
            # instead of being printed as a separate tuple.
            print('thread %d found artist: %s' % (self.thread_id, artist))

    # we're overriding the default join function for the thread so
    # that we can make sure it stops
    def join(self, timeout=None):
        """Ask the crawl loop to stop, then wait for the thread to finish."""
        self.stop_request.set()
        super(Crawler, self).join(timeout)

if __name__ == '__main__':
    # how many threads do you want?  more is faster, but too many
    # might get your IP blocked or even bring down the site (DoS attack)
    n_threads = 10

    # use a standard queue object (thread-safe) for communication
    queue = Queue.Queue()

    # create our threads
    threads = [Crawler(i, queue) for i in range(n_threads)]

    # generate one chart url per week, from start_date through end_date
    # inclusive, and fill the queue
    url_template = 'http://www.billboard.com/charts/%s/hot-100'
    start_date = datetime.datetime(year=1958, month=8, day=9)
    end_date = datetime.datetime(year=1959, month=9, day=5)
    delta = datetime.timedelta(weeks=1)

    date = start_date
    while date <= end_date:
        queue.put(url_template % date.strftime('%Y-%m-%d'))
        date += delta

    # start crawling!
    for t in threads:
        t.start()

    try:
        # wait until the queue is empty; also stop waiting once no worker is
        # alive, because then nothing would ever drain the queue and this
        # loop would spin forever
        while not queue.empty() and any(t.is_alive() for t in threads):
            time.sleep(0.01)
    finally:
        # always stop the threads (Crawler.join sets the stop event before
        # waiting), even if the wait above was interrupted, e.g. by Ctrl-C;
        # otherwise the non-daemon workers would keep the process running
        for t in threads:
            t.join()

python - Urllib2 でより効率的にスクレイピングするには?

2 件の回答

Related

Reference