
I have a file, searchengine.py, and I have also created the index for it.

searchengine.py:

import sqlite3
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin

# Create a list of words to ignore
ignorewords=set(['the','of','to','and','a','in','is','it'])

class crawler:
    # Initialize the crawler with the name of database
    def __init__(self,dbname):
        self.con=sqlite3.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        # Still a stub at this stage; nothing is committed to the database yet
        pass

    # Auxiliary function for getting an entry id and
    # adding it if not present
    def getentryid(self, table, field, value, createnew=True):
        cur=self.con.execute("select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing %s' %url

        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get the URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) \
                values (%d,%d,%d)" % (urlid,wordid,i))


    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()


    # Separate the words on any non-word character
    def separatewords(self, text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    # Return true if this url is already indexed
    def isindexed(self, url):
        u=self.con.execute("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Check if it has actually been crawled
            v=self.con.execute('select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False

    # Add a link between two pages
    def addlinkref(self,urlFrom,urlTo,linkText):
        pass

    # Starting with a list of pages, do a breadth first search to
    # the given depth, indexing pages as we go
    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)

                links=soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0] # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText=self.gettextonly(link)
                        self.addlinkref(page,url,linkText)

                self.dbcommit()

            pages=newpages

    # Creating index tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordid on wordlist(word)')
        self.con.execute('create index urlid on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

The index, searchindex.db, was created using the Python shell:

>>> reload(searchengine)
>>> crawler=searchengine.crawler('searchindex.db')
>>> crawler.createindextables()
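
As a quick sanity check (a hypothetical step, not part of the original session), the newly created tables can be listed from SQLite's built-in sqlite_master catalog:

>>> import sqlite3
>>> con = sqlite3.connect('searchindex.db')
>>> print [r[0] for r in con.execute("select name from sqlite_master where type='table'")]
[u'urllist', u'wordlist', u'wordlocation', u'link', u'linkwords']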

I tried using it like this, but I am getting an error:

>>> reload(searchengine)
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages=['http://kiwitobes.co/wiki/Categorical_list_of_programming_languages.html']
>>> crawler.crawl(pages)
Indexing http://www.tartarus.org/~martin/PorterStemmer/index.html

Traceback (most recent call last):
  File "<pyshell#22>", line 1, in <module>
    crawler.crawl(pages)
  File "C:/Users/dj/Desktop\searchengine.py", line 103, in crawl
    self.addtoindex(page,soup)
  File ""C:/Users/dj/Desktop\searchengine.py", line 38, in addtoindex
    words=self.separatewords(text)
  File ""C:/Users/dj/Desktop\searchengine.py", line 68, in separatewords
    splitter=re.compile('\\W*')
NameError: global name 're' is not defined

Python version: 2.7, OS: Windows 8


1 Answer


You use the re module in your code:

def separatewords(self, text):
    splitter=re.compile('\\W*')
    # here --^
    return [s.lower() for s in splitter.split(text) if s!='']

But nowhere in your script do you have:

import re

import re loads the re module into memory. When you try to use a module that has not been loaded into memory, you get a NameError.
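
A minimal, self-contained reproduction of the failure (hypothetical code, just to illustrate the mechanism):

def separate(text):
    # 're' is looked up when the function is called; it was never
    # imported, so the lookup fails
    return re.compile('\\W*').split(text)

separate('hello world')
# NameError: global name 're' is not defined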

So, to fix the problem, just add import re at the top of your script, along with all the other imports.
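
With that fix, the import block at the top of searchengine.py would look like this (only the first line is new; the rest is unchanged from your file):

import re                        # needed by separatewords()
import sqlite3
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin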

Answered on 2013-10-06T15:18:38.940