ここでは、Python と Beautiful Soup を使用して、あるページに含まれるすべてのリンクを解析し、リンクのリポジトリに格納するコードを記述しました。次に、作成したリポジトリから任意の URL を選んでそのコンテンツを取得し、その新しいコンテンツから見つかったリンクもリポジトリに追加します。リポジトリ内のすべてのリンクを処理し終えるか、指定した数のリンクを取得するまで、このプロセスを繰り返します。
しかし、このコードは非常に遅いです。Python で gevent を使った非同期プログラミングによって改善するには、どうすればよいでしょうか?
コード
class Crawler(object):
    """Sequential random-walk web crawler.

    Starting from ``current_page``, every call to :meth:`open` downloads one
    page, records the absolute links found on it and picks a random link
    that has not been downloaded yet as the next page to fetch.
    """

    def __init__(self):
        self.soup = None                               # BeautifulSoup tree of the last page parsed
        self.current_page = "http://www.python.org/"   # next URL to fetch; None once nothing is left
        self.links = set()                             # every absolute link discovered so far
        self.visited_links = set()                     # URLs that have already been downloaded
        self.counter = 0                               # number of pages fetched (debug output only)

    @staticmethod
    def _absolute_links(hrefs):
        """Return the set of absolute http(s) URLs contained in *hrefs*.

        ``a.get('href')`` yields ``None`` for anchors without an ``href``
        attribute; those, empty strings and relative or non-HTTP targets are
        dropped.  A prefix test is used rather than a substring test so that
        values such as ``/redirect?to=http://...`` are not mistaken for
        absolute URLs.
        """
        return set(
            href for href in hrefs
            if href and href.startswith(('http://', 'https://'))
        )

    def open(self):
        """Fetch ``current_page``, harvest its links and choose the next page.

        Errors raised by ``urllib2.urlopen`` propagate to the caller.  After
        the call ``current_page`` is a random unvisited link, or ``None``
        when every known link has already been visited.
        """
        # A single-argument print(...) gives the same output on Python 2 and 3.
        print("%s : %s" % (self.counter, self.current_page))
        res = urllib2.urlopen(self.current_page)
        try:
            html_code = res.read()
        finally:
            res.close()  # release the connection even if read() fails
        self.visited_links.add(self.current_page)

        # Collect every absolute link on the page.
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        hrefs = (a.get('href') for a in self.soup.findAll('a'))
        self.links = self.links.union(self._absolute_links(hrefs))

        # Choose a random URL among those not visited yet, if any is left.
        unvisited = self.links.difference(self.visited_links)
        self.current_page = random.choice(list(unvisited)) if unvisited else None
        self.counter += 1

    def run(self):
        """Crawl up to three pages, then print every link collected."""
        # Stop after three pages, or earlier once no unvisited link remains.
        while self.current_page is not None and len(self.visited_links) < 3:
            self.open()
        for link in self.links:
            print(link)
if __name__ == '__main__':
    # Script entry point: build a crawler and run the crawl.
    C = Crawler()
    C.run()
更新 1
# Monkey-patch first, before any other module is imported, so that the
# modules below (urllib2 in particular) use gevent's cooperative versions.
import gevent.monkey
gevent.monkey.patch_thread()
gevent.monkey.patch_all(thread=False)

import itertools
import random
import sys
import urllib2
import urlparse

import gevent
from bs4 import BeautifulSoup
class Crawler(object):
    """Web crawler that downloads several pages concurrently with gevent.

    ``links`` accumulates every absolute URL discovered, ``visited_links``
    every URL for which a download has been started.
    """

    def __init__(self):
        self.soup = None                               # BeautifulSoup tree of the last page parsed
        self.current_page = "http://www.python.org/"   # seed URL / next URL suggested by open()
        self.links = set()                             # every absolute link discovered so far
        self.visited_links = set()                     # URLs whose download has been started
        self.counter = 0                               # number of downloads started (debug output only)

    @staticmethod
    def _resolve_link(base_url, href):
        """Turn one ``href`` found on page *base_url* into an absolute URL.

        Returns ``None`` for anchors without an ``href`` and for targets
        that do not resolve to an http(s) URL (``mailto:``, ``javascript:``
        and the like).  Relative references are resolved with
        ``urlparse.urljoin`` instead of string concatenation.
        """
        if not href:
            return None
        print("Found link: '" + href + "'")
        if href.startswith('http'):
            print("%s %s" % ('entered in if link: ', href))
            print("Adding link" + href + "\n")
            return href
        if href.startswith('/'):
            print("%s %s" % ('entered in elif link: ', href))
        else:
            print("%s %s" % ('entered in else link: ', href))
        resolved = urlparse.urljoin(base_url, href)
        if not resolved.startswith('http'):
            return None
        print("Adding link " + resolved + "\n")
        return resolved

    def open(self, url=None):
        """Download one page and merge the links found on it into ``links``.

        url -- page to fetch; defaults to ``current_page`` so that existing
               ``open()`` callers keep working.  Passing it explicitly lets
               each greenlet work on its own URL instead of all of them
               reading the shared ``current_page``.

        Errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        if url is None:
            url = self.current_page
        if url is None:
            return  # nothing left to crawl

        print("%s : %s" % (self.counter, url))
        # Claim the URL before the blocking fetch: with gevent's patched
        # sockets other greenlets presumably run while this one waits, and
        # a URL that fails to download must not be retried forever.
        self.counter += 1
        self.visited_links.add(url)

        res = urllib2.urlopen(url)
        try:
            html_code = res.read()
        finally:
            res.close()  # release the connection even if read() fails

        # Collect every link on the page as an absolute URL.
        self.soup = soup = BeautifulSoup(html_code)
        page_links = set()
        for anchor in soup.find_all('a'):
            resolved = self._resolve_link(url, anchor.get('href'))
            if resolved is not None:
                page_links.add(resolved)
        self.links.update(page_links)

        # Suggest a random unvisited URL as the next page, if any is left.
        unvisited = self.links.difference(self.visited_links)
        self.current_page = random.choice(list(unvisited)) if unvisited else None

    def run(self, max_pages=3):
        """Crawl up to *max_pages* pages, then print every link collected.

        The seed page is fetched first because its links are needed before
        anything else can be scheduled; after that each round downloads a
        batch of distinct unvisited URLs concurrently.
        """
        batch = [self.current_page] if self.current_page else []
        while batch and len(self.visited_links) < max_pages:
            budget = max_pages - len(self.visited_links)
            greenlets = [gevent.spawn(self.open, url) for url in batch[:budget]]
            gevent.joinall(greenlets)
            batch = list(self.links.difference(self.visited_links))
            random.shuffle(batch)
        for link in self.links:
            print(link)
if __name__ == '__main__':
    # Script entry point: build a crawler and run the crawl.
    C = Crawler()
    C.run()