python - 基本的なスパイダープログラムは実行されません

Question

Pythonで基本的なスパイダープログラムを構築するのに問題があります。実行しようとするとエラーが発生します。エラーは、コードの最後の7行のどこかで発生します。

#These modules do most of the work.
import sys
import urllib2
import urlparse
import htmllib, formatter
from cStringIO import StringIO


def log_stdout(msg):
    """Print msg to the screen (standard output), followed by a newline."""
    # The call form prints a single argument exactly like the old
    # ``print msg`` statement, but is also valid syntax on Python 3.
    print(msg)

def get_page(url, log):
    """Retrieve URL and return its contents, logging retrieval errors.

    url -- address to fetch with urllib2.urlopen.
    log -- callable taking one message string; called when the URL
           cannot be opened.

    Returns the response body, or '' when urllib2.URLError is raised
    while opening the URL.
    """
    try:
        page = urllib2.urlopen(url)
    except urllib2.URLError:
        log("Error retrieving: " + url)
        return ''
    # Close the response even if read() fails part-way, so a broken
    # transfer does not leave the response object open.
    try:
        return page.read()
    finally:
        page.close()

def find_links(html):
    """Return a list links in html."""
    # We're using the parser just to get the HREFs
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist

class Spider:

    """
    Site crawler: collects every link reachable inside one web site.

    run() drives the crawl until no queued URLs remain.
    process_page() downloads a single page and queues its new links.
    """

    def __init__(self, startURL, log=None):
        """Seed the crawl with startURL; log defaults to log_stdout."""
        # Only links that begin with this prefix count as part of the site.
        self.include = startURL
        # Every URL discovered so far, and the ones still awaiting a visit.
        self.URLs = set([startURL])
        self._links_to_process = [startURL]
        self.log = log_stdout if log is None else log

    def run(self):
        """Visit queued URLs, most recently added first, until none remain."""
        while self._links_to_process:
            current = self._links_to_process.pop()
            self.log("Retrieving: " + current)
            self.process_page(current)

    def url_in_site(self, link):
        """Tell whether link begins with the crawl's starting URL."""
        site_prefix = self.include
        return link.startswith(site_prefix)

    def process_page(self, url):
        """Fetch url and queue each in-site link not seen before."""
        page_source = get_page(url, self.log)
        for href in find_links(page_source):
            # Resolve relative links against the page they were found on.
            absolute = urlparse.urljoin(url, href)
            self.log("Checking: " + absolute)
            # Skip anything already recorded or outside the current site.
            if absolute in self.URLs or not self.url_in_site(absolute):
                continue
            self.URLs.add(absolute)
            self._links_to_process.append(absolute)

エラーメッセージは、このコードブロックに関連しています。

if __name__ == '__main__':
    # This code runs when the script is started from the command line.
    # sys.argv[0] is the script itself, so the start URL must be given as
    # the first real argument; without this check sys.argv[1] raises
    # "IndexError: list index out of range" when no URL is supplied.
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Usage: python %s <start-url>" % sys.argv[0])
    startURL = sys.argv[1]
    spider = Spider(startURL)
    spider.run()
    # Report every URL found, in sorted order, one per line.
    for URL in sorted(spider.URLs):
        print(URL)


The error message:
        startURL = sys.argv[1]
    IndexError: list index out of range

score 4 · Accepted Answer

スパイダープログラムを引数付きで呼び出していません。sys.argv[0]はスクリプトファイル自体であり、sys.argv[1]が渡された最初の引数になります。「リストインデックスが範囲外」とは、引数を指定しなかったことを意味します。

python spider.py http://www.example.com（実際のURLを使用して）と呼んでみてください。

score 0 · Accepted Answer

これはあなたの質問への直接の回答ではありませんが：

私なら次のようにします：

# Requires the third-party lxml package (import lxml.html).
START_PAGE = 'http://some.url.tld'
# parse() returns an element tree; query it with xpath() to collect the
# href attribute of every <a> element. (getroottree() takes no arguments
# and does not run an XPath expression.)
ahrefs = lxml.html.parse(START_PAGE).xpath('//a/@href')

次に、lxml.htmlオブジェクトで使用可能なメソッドを使用し、リンクをマルチプロセスで処理します

これは「半奇形」のHTMLを処理し、BeautifulSoupライブラリをプラグインできます。

JavaScriptで生成されたリンクをたどろうとする場合でも、少し作業が必要ですが、それが人生です。

python - 基本的なスパイダープログラムは実行されません

2 に答える 2

Related

Reference