1

これに os.walk を含めましたが、それでも .csv が出力されず、html ディレクトリを読み取っているのかどうかわかりません。情報 (URL、電子メール、名前、電話など) をスクレイピングして .csv に出力する必要があるオフライン Web サイトのディレクトリがあります。これを実行すると (実行可能ではないことはわかっています)、13 行目でアクセス許可拒否エラーが発生してハングします。

import os, csv
from bs4 import BeautifulSoup


def main(folder, outputfile):
    with open(outputfile, "wb") as f:
        w = csv.writer(f)
        header = ("Tag", "Name", "Name", "Email", "Phone", "Location", "URL")
        w.writerow(header)
        rows = crawlhtmls(folder)
        w.writerows(rows)

def crawlhtmls(folder):
        for root, dirs, files in os.walk(folder):
    for f in files:
        if f.lower().endswith(".html"):
        soup = BeautifulSoup(f.read())
        events = soup.findAll('div', attrs={'class': 'post'})
        headline = x.find('h2')
        name = x.find('')
        email = x.find('address') 
        phone = x.find('tel')
        description = x.find('div', attrs={'class': 'entry'})

        headline2 = str(headline)
        name2 = str(name)
        email2 = str(name)
        phone2 = str(phone)

        description2 = str(description)

        headline3 = headline2.replace(",", " -")
        name3 = name2.replace(",", " -")
        email3 = email2.replace(",", " -")
        phone3 = phone2.replace(",", " -")
        description3 = description2.replace(",", " -")

        headline4 = headline3.replace('<h2 class', "")
        headline5 = headline4.replace('</h2>', "")
        headline6 = headline5.replace('- ', "")
        headline7 = headline6.replace("at ", "")

        description4 = description3.replace('[<p>', "")
        description5 = description4.replace('</p>]', "")
        description6 = description5.replace('\n', " ")
        description7 = description6.replace('[]', "")

        link4 = link3.replace('<a href', "")
        link5 = link4.replace('</a>', "")
        link6 = link5.replace('h2', " ")
        link7 = link6.replace('=', "")

        seq = (headline7, name3, email3, phone3, descripton7)
        yield seq

if __name__ == "__main__":
    folderPath = r"C:\projects\training\html" 
    output = r"C:\projects\training\about.csv" 
    main(folderPath, output)
4

0 に答える 0