python - URL を読み取った後にデータのブロックをキャプチャする

Question

以下に予想される出力があります。URL を読み込もうとしていますが、正常に読み取ることができますが、ブロック「コンボ」の下でデータをキャプチャしようとすると、エラーが発生します。これを解決する方法についての入力はありますか?

# Version YYYYMMDD
version = "20121112"

# File type to be output to logs
# Should be changed to exe before building the exe.
fileType = "py"

# Import sys to read command line arguments
import sys, getopt
#import pdb
#pdb.set_trace()

import argparse
import urllib
import urllib2
import getpass
import re

def update (url):
    print url

    authhost = 'https://login.company.com'
    # Siteminder test server
    user = getpass.getuser()
    password = getpass.getpass()
    realm = None

    # handle the authentication and cookies
    cookiehand = urllib2.HTTPCookieProcessor()
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(user=user,
                              passwd=password,
                              uri=authhost,
                              realm=realm)
    auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(auth_handler, cookiehand)
    urllib2.install_opener(opener)
    #make the request
    req = urllib2.Request(url=url)
    try:
        f = urllib2.urlopen(req)
        txt = f.read()
        f.close()
    except urllib2.HTTPError, e:
        txt = ''
        print 'An error occured connecting to the wiki. No wiki page will be generated.'
        return '<font color=\"red\">QWiki</font>'
    # Find the start tag of the textarea with Regular Expressions
    print txt
    p = re.compile('<Combo[^>]*>')
    m = p.search(txt)
    (tagStart, tagEnd) = m.span()
    # Find the end of the textarea
    endTag = txt.index("</textarea>")

def main ():
    #For logging
    print "test"
    parser = argparse.ArgumentParser(description='This is the update.py script created by test')
    parser.add_argument('-u','--url',action='store',dest='url',default=None,help='<Required> url link',required=True)
    results = parser.parse_args()# collect cmd line args
    url = results.url
    #print url
    update(url)
if __name__ == '__main__':
    main()

現在の出力:-

C:\Dropbox\scripts>python announce_update.py --u "http://qwiki.company.com/component/w/index.php?title=Test1&action=raw"
test
http://qwiki.company.com/component/w/index.php?title=Test1&action=raw
Password:
==== <font color="#008000">Combo</font> ====

{| border="1" cellspacing="1" cellpadding="1"
|-
! bgcolor="#67B0F9" scope="col" | test1
! bgcolor="#67B0F9" scope="col" | test2
! bgcolor="#67B0F9" scope="col" | test3
! bgcolor="#67B0F9" scope="col" | test4
|-
| [http:link.com]
|}

==== <font color="#008000">COde:</font> ====
Traceback (most recent call last):
  File "announce_update.py", line 66, in <module>
    main()
  File "announce_update.py", line 64, in main
    update(url)
  File "announce_update.py", line 52, in update
    (tagStart, tagEnd) = m.span()
AttributeError: 'NoneType' object has no attribute 'span'

期待される出力:-

{| border="1" cellspacing="1" cellpadding="1"
|-
! bgcolor="#67B0F9" scope="col" | test1
! bgcolor="#67B0F9" scope="col" | test2
! bgcolor="#67B0F9" scope="col" | test3
! bgcolor="#67B0F9" scope="col" | test4
|-
| [http:link.com]
|}

score 1 · Accepted Answer

p.search(txt)Noneテキスト内にパターンpが見つからない場合に返されますtxt。None.spanエラーにつながります。

HTMLの最初の要素からテキストを抽出するには、正規表現の代わりに (html パーサー) を<textarea>使用できます。BeautifulSoup

from bs4 import BeautifulSoup # pip install beautifulsoup4

soup = BeautifulSoup(txt)
print(soup.textarea.string)

HTMLParserstdlibのみを使用して同じことを試みることができます。

#!/usr/bin/env python
import cgi

try:
    from html.parser import HTMLParser
except ImportError: # Python 2
    from HTMLParser import HTMLParser

try:
    from urllib.request import urlopen
except ImportError: # Python 2
    from urllib2 import urlopen

url = 'http://qwiki.company.com/component/w/index.php?title=Test1&action=raw'
tag = 'textarea'

class Parser(HTMLParser):
    """Extract tag's text content from html."""
    def __init__(self, html, tag):
        HTMLParser.__init__(self)
        self.contents = []
        self.intag = None
        self.tag = tag
        self.feed(html)

    def handle_starttag(self, tag, attrs):
        self.intag = (tag == self.tag)
    def handle_endtag(self, tag):
        self.intag = False
    def handle_data(self, data):
        if self.intag:
            self.contents.append(data)

# download and convert to Unicode
response = urlopen(url)
_, params = cgi.parse_header(response.headers.get('Content-Type', ''))
html = response.read().decode(params['charset'])

# parse html (extract text from the first `<tag>` element)
content = Parser(html, tag).contents[0]
print(content)

score 0 · Accepted Answer

このエラーは、文字列mが空であるか定義されていないことを示しています。

さらに、正規表現はの閉じ括弧で停止するため、とにかく正しいテキストを見つけられないようです</font>。

http://docs.python.org/2/howto/regex.htmlreで使用に関する素敵なリファレンスを見つけました

それを読んだ後、このような表現が必要だと思います

p = re.compile(r'>Combo<.*({.*})');

rto は文字列を表しraw、バックスラッシュなどを解釈しないように Python に指示することに注意してください。かっこで「グループ」を作成したので、「一致するこのビットのみ」を抽出できます。で検索すると

m = p.match();

>Combo<に続く中かっこの最初のセットのビットだけを抽出できるはずです

myText = m.group(1);

これは完璧ではないかもしれませんが、非常に近いはずです。「>Combo< の後の最初の左中括弧から次の右中括弧まで」を見つける必要があることを示しようとしています。かっこは「これは私が欲しいビットです」を示すためのものであり、インデックス付きはgroupそれをオブジェクトから抽出しmatchます。

python - URL を読み取った後にデータのブロックをキャプチャする

2 に答える 2

Related

Reference