0

Web サイトのデータを解析する必要があります: http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html

BeautifulSoup のチュートリアルのほとんどは、リンクを解析するためのものであり、リンクから必要なデータを詳細に解析するためのものではありません。

今、私はpythonのBeautifulSoupモジュールのいくつかのチュートリアルを経て、必要なデータ文字列をダウンロードするためにこのスクリプトを書きました

 <div id="content_box">
        <div id="content" class="hfeed">...

私が使用しているスクリプト:

from BeautifulSoup import BeautifulSoup
import urllib2

def main():
    """Fetch the job posting page and save the detail strings to postdata.txt.

    The detail values live in <span class="scdetail"> elements (see the
    sample HTML shown in the question), not in <div class="scdetail"> --
    searching for divs is why the original script found nothing.
    """
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup(data)

    postdata = bs.find('div', {'id': 'content_box'})
    if postdata is None:
        # Page layout changed (or an error page came back) -- nothing to save.
        return
    # The labelled values are carried by span tags, not div tags.
    postdata = [s.getText().strip() for s in postdata.findAll('span', {'class': 'scdetail'})]

    fname = 'postdata.txt'
    with open(fname, 'w') as outf:
        # Encode explicitly: the page contains non-ASCII text and under
        # Python 2 writing unicode to a byte stream would raise.
        outf.write('\n'.join(postdata).encode('utf8'))

if __name__=="__main__":
    main()

しかし、このスクリプトは期待どおりに動作しません。次のような形で投稿データをファイルに保存したいのです:

タイトル: チャンディーガル国立電子情報技術研究所のサブセンターマネージャーの欠員

サブセンターマネージャー

電子情報技術総合研究所

住所: NIELIT, Chandigarh SCO: 114-116 Sector 17B

郵便番号: 160017

都市: チャンディーガル …など

助けたり提案したりしてください。

ありがとう

4

3 に答える 3

0

この pyparsing エクストラクタは、一致する div/span タグを選択します。

from pyparsing import makeHTMLTags, withAttribute, SkipTo

"""
sample:
<div class="scheading">Postal Code: <span class="scdetail" 
    itemprop="postalCode">160017</span></div>
"""
# Build matchers for opening and closing div/span tags.
div,divEnd = makeHTMLTags("div")
span,spanEnd = makeHTMLTags("span")
# Accept only the label/value classes this page uses: the label text sits in
# a div.scheading and the value in a nested span.scdetail.
div.setParseAction(withAttribute(("class","scheading")))
span.setParseAction(withAttribute(("class","scdetail")))

# One labelled detail: everything between the div open tag and the span open
# tag is the label; everything inside the span is the value.
patt = (div + SkipTo(span)("label") + span + SkipTo(spanEnd)("value") + 
            spanEnd + divEnd)

attrs = {}
# NOTE(review): `html` must already hold the fetched page source; it is not
# defined in this snippet -- confirm it is assigned before this point.
for match in patt.searchString(html):
    # Key each detail on the span's itemprop attribute, storing the
    # (label text, value text) pair.
    attrs[match.itemprop] = (match.label[0].strip(), match.value)

from pprint import pprint
pprint(attrs.items())

プリント:

[('skills',
  ('Desired Skills:',
   'Preference will be given to candidates having good knowledge of UNIX &amp; Visual FoxPro.')),
 ('qualifications',
  ('Qualifications:',
   '\x91A\x92 level of DOEACC / PGDCA with 2 years experience. ')),
 ('educationRequirements',
  ('Educational Requirements:',
   'B. E. / B. Tech. (CS / IT / Electronics) / MCA / M. Sc. (CS / IT / Electronics) / \x91B\x92 level of DOEACC ')),
 ('addressLocality', ('City', 'Chandigarh')),
 ('addressRegion', ('State', 'Haryana and Punjab')),
 ('streetAddress', ('Address:', 'NIELIT, Chandigarh SCO: 114-116 Sector 17B')),
 ('postalCode', ('Postal Code:', '160017')),
 ('baseSalary', ('Pay Scale:', 'Rs. 15,000/-'))]
于 2013-06-29T11:02:37.973 に答える
0

このソリューションは BeautifulSoup を使用しています

import os
import sys

# Import System libraries
import re
import urllib2

# Import Custom libraries
from BeautifulSoup import BeautifulSoup, Tag

def job_location(x):
    """Match the <div id="content"> element that wraps the whole posting."""
    return x.name == "div" and set([(u"id", u"content")]) <= set(x.attrs)

def job_title_location(x):
    """Match the element carrying the schema.org job title."""
    return set([(u"class", u"schema_title"), (u"itemprop", u"title")]) <= set(x.attrs)

def organ_location(x):
    """Match the element carrying the hiring organisation's name."""
    return set([(u"class", u"schema_hiringorganization"), (u"itemprop", u"name")]) <= set(x.attrs)

def details_key_location(x):
    """Match the heading divs whose class looks like 's...heading'."""
    return x.name == "div" and bool(re.search("s.*heading", dict(x.attrs).get(u"class", "")))

def coll_up(ilist, base=0, count=0):
    '''
    Recursively collapse nested lists at depth base and above
    '''
    if isinstance(ilist, (list, tuple)):
        # Concatenate the collapsed children of this (possibly nested) list.
        merged = []
        for element in ilist:
            merged += coll_up(element, base, count + 1)
        result = merged
    else:
        # Leaf: below the collapse depth return it bare so the caller splices
        # it in; otherwise wrap it so it survives as a single element.
        result = ilist if base > count else [ilist]
    # Levels shallower than `base` (except the outermost call) keep their own
    # list wrapper instead of being merged into the parent.
    if count != 0 and base > count:
        return [result]
    return result

def info_extract(ilist, count=0):
    '''
    Recursively walk a nested list and upon finding a non iterable, return its string
    '''
    found = []
    if isinstance(ilist, list):
        for element in ilist:
            if isinstance(element, Tag):
                # Descend into the tag's child nodes.
                found += info_extract(element.contents, count + 1)
            else:
                text = element.strip()
                # Keep only non-empty stripped strings.
                if text:
                    found += [text]
    # Every recursive level wraps its results in one extra list; the
    # outermost call returns the list as-is.
    if count != 0:
        return [found]
    return found

def main():
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)

    job_tags = soup.findAll(job_location)
    if(job_tags):
        job_tag = job_tags[0]
        job_title = info_extract(job_tag.findAll(job_title_location))[0]
        organ = info_extract(job_tag.findAll(organ_location))[0]
        details = coll_up(info_extract(job_tag.findAll(details_key_location)), 2)

        combined_dict = dict([tuple(["Job Title:"] + job_title)] + [tuple(["Organisation:"] + organ)] + [tuple(detail) for detail in details])
        combined_list = [["Job Title:"] + job_title, ["Organisation:"] + organ] + details
        postdata = [" ".join(x) for x in combined_list]
        print postdata

        fname = "postdata.txt"
        with open(fname, "w") as outf:
            outf.write("\n".join(postdata).encode("utf8"))

if __name__=="__main__":
    main()
于 2013-06-30T14:13:34.877 に答える