これに os.walk を含めましたが、それでも .csv が出力されず、html ディレクトリを読み取っているのかどうかわかりません。情報 (URL、電子メール、名前、電話など) をスクレイピングして .csv に出力する必要があるオフライン Web サイトのディレクトリがあります。これを実行すると (実行可能ではないことはわかっています)、13 行目でアクセス許可拒否エラーが発生してハングします。
import os, csv
from bs4 import BeautifulSoup
def main(folder, outputfile):
with open(outputfile, "wb") as f:
w = csv.writer(f)
header = ("Tag", "Name", "Name", "Email", "Phone", "Location", "URL")
w.writerow(header)
rows = crawlhtmls(folder)
w.writerows(rows)
def crawlhtmls(folder):
for root, dirs, files in os.walk(folder):
for f in files:
if f.lower().endswith(".html"):
soup = BeautifulSoup(f.read())
events = soup.findAll('div', attrs={'class': 'post'})
headline = x.find('h2')
name = x.find('')
email = x.find('address')
phone = x.find('tel')
description = x.find('div', attrs={'class': 'entry'})
headline2 = str(headline)
name2 = str(name)
email2 = str(name)
phone2 = str(phone)
description2 = str(description)
headline3 = headline2.replace(",", " -")
name3 = name2.replace(",", " -")
email3 = email2.replace(",", " -")
phone3 = phone2.replace(",", " -")
description3 = description2.replace(",", " -")
headline4 = headline3.replace('<h2 class', "")
headline5 = headline4.replace('</h2>', "")
headline6 = headline5.replace('- ', "")
headline7 = headline6.replace("at ", "")
description4 = description3.replace('[<p>', "")
description5 = description4.replace('</p>]', "")
description6 = description5.replace('\n', " ")
description7 = description6.replace('[]', "")
link4 = link3.replace('<a href', "")
link5 = link4.replace('</a>', "")
link6 = link5.replace('h2', " ")
link7 = link6.replace('=', "")
seq = (headline7, name3, email3, phone3, descripton7)
yield seq
if __name__ == "__main__":
folderPath = r"C:\projects\training\html"
output = r"C:\projects\training\about.csv"
main(folderPath, output)