
This is a crawler that prints every URL reachable from a given link.

#!C:/Python27/python.exe -u
import urllib
import cgi,cgitb
cgitb.enable()

print "Content-Type: text/html\n\n"

def get_page(url):
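    # Download the page at url; return an empty string on any error so the crawl keeps going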
    try:
        return urllib.urlopen(url).read()
    except:
        return ""

def get_next_target(page):
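    # Find the next '<a href=' anchor and return its quoted URL plus the index of the
    # closing quote; returns (None, 0) when no anchor is left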
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(a, b):
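    # In-place union: append to a every element of b it does not already contain (nothing is returned)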
    for e in b:
        if e not in a:
            a.append(e)

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def crawl_web(seed): # returns index, graph of inlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

index, graph = crawl_web('http://www.bing.com/results.asp?q=fulcrum')

print graph
print """
<html>
<body>
Animesh Pandey
</body>
</html>
"""
print "<br>"
print graph
print "<br>"
print index
print "<br>"
print tocrawl
print "<br>"
print seed

This Python file runs fine in an online interpreter; at least it produces some results. But when I run it in the browser, it always times out. I am using Apache 2.2.11 and Python 2.7.3.

Please tell me what I should do.
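One thing I have been wondering about (just a sketch, not part of the script above): urllib.urlopen has no time limit by default, so in Python 2 a global cap can be set with socket.setdefaulttimeout before crawling:

import socket

# Illustrative only: limit every urlopen call to 10 seconds so a single
# unresponsive host cannot block the CGI script; the value is arbitrary.
socket.setdefaulttimeout(10)

That only bounds each individual request, though, not the total time the whole crawl takes.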


1 Answer


Here is the correct version of the code:

#!C:/Python27/python.exe -u
import urllib
import cgi,cgitb
cgitb.enable()

print "Content-Type: text/html\n\n"

def get_page(url):
    try:
        return urllib.urlopen(url).read()
    except:
        return ""

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def crawl_web(seed): # returns index, graph of inlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

theurl = "http://dl.dropbox.com/u/86814352/ani.html"
index, graph = crawl_web(theurl)

print "<br>"
print graph
print "<br>"

This prints the URLs found on that static web page, but the code is not suitable for sites with many links.
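If you do want to run it against pages with many outgoing links, one option (only a sketch; max_pages is a made-up parameter, and this reuses the helper functions defined above) is to cap the number of pages crawled so the CGI response returns before Apache's timeout:

def crawl_web_limited(seed, max_pages=20):
    # Same crawl as crawl_web above, but stops once max_pages pages have been fetched
    tocrawl = [seed]
    crawled = []
    graph = {}
    index = {}
    while tocrawl and len(crawled) < max_pages:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

Calling crawl_web_limited(theurl, 10) instead of crawl_web(theurl) would then index at most 10 pages per request.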

Answered 2012-08-14 at 14:39:34