Web サイトからスクレイピングして解析したすべてのデータを CSV ファイルに書き出そうとしていますが、いくつか問題が発生しています。
1. 文字エンコーディングを指定しても、Excel で開くとプレーンテキストではなく HTML のまま出力されます。
例えば
<option redirectvalue="/partfinder/Asus/All In One/E Series/ET10B">ET10B</option>
2. データが行ごとに分かれず、すべて 1 列にまとめて出力されます。
これまでの私のコードは次のとおりです。
import string, urllib2, urlparse, csv, sys, codecs, cStringIO
from urllib import quote
from urlparse import urljoin
from bs4 import BeautifulSoup
from ast import literal_eval
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    This is Python 2 code: the csv module there works on byte strings, so
    every row is first written as UTF-8 into an in-memory queue and then
    re-encoded into the target encoding before it reaches the stream.

    Parameters:
        f: writable file-like object receiving the encoded CSV data
           (open it in binary mode, e.g. "wb").
        dialect: csv dialect forwarded to csv.writer.
        encoding: name of the target encoding of the output stream.
        **kwds: extra formatting options forwarded to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row (an iterable of cells) to the target stream.

        Cells are expected to be text. None is written as an empty cell and
        any other non-string value is converted with unicode() first, so
        numbers no longer raise AttributeError on ".encode".
        """
        encoded_cells = []
        for cell in row:
            if cell is None:
                # Same convention as the csv module: None -> empty string.
                cell = u""
            elif not isinstance(cell, basestring):
                cell = unicode(cell)
            encoded_cells.append(cell.encode("utf-8"))
        self.writer.writerow(encoded_cells)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of the iterable "rows" using writerow()."""
        for row in rows:
            self.writerow(row)
# Scraper script (Python 2): read the model selector of the ASUS parts finder,
# visit every model page, print what was found there, and finally save the
# model names to asus_stock.csv as plain text, one model per row.
changable_url = 'http://www.asusparts.eu/partfinder/Asus/All%20In%20One/E%20Series'
base_url = 'http://www.asusparts.eu'

page = urllib2.urlopen(changable_url)
try:
    soup = BeautifulSoup(page)
finally:
    # The response is fully parsed at this point; release the connection.
    page.close()

selects = []
redirects = []
model_info = []

print("FETCHING OPTIONS")
select = soup.find(id='myselectListModel')
selects.append(select)
for item in selects:
    print(item.get_text())

options = select.findAll('option')
for option in options:
    if option.has_attr('redirectvalue'):
        redirects.append(option['redirectvalue'])

for r in redirects:
    # redirectvalue holds a raw path (it can contain spaces), so it is
    # percent-encoded before being joined onto the site root.
    rpage = urllib2.urlopen(urljoin(base_url, quote(r)))
    try:
        s = BeautifulSoup(rpage)
    finally:
        rpage.close()

    # Fetching the main title for each specific model and printing it out
    print("FETCHING MAIN TITLE")
    maintitle = s.find(id='puffBreadCrumbs')
    model_info.append(maintitle)
    print(maintitle.get_text())

    datas = s.find(id='accordion')
    a = datas.findAll('a')
    content = datas.findAll('span')

    print("FETCHING CATEGORY")
    for data in a:
        if data.has_attr('onclick'):
            # Drop the ", this" argument and keep what follows the first "("
            # of the onclick call, then parse that argument list as a tuple.
            call_args = data['onclick'].replace(', this', '').split('(', 1)[1]
            arguments = literal_eval('(' + call_args)
            print(arguments)

    # Retrieves Part number and Price
    print("FETCHING DATA")
    for complete in content:
        if complete.has_attr('class'):
            print(complete.get_text())

    print("FETCHING IMAGES")
    img = s.find('td')
    images = img.findAll('img')
    model_info.append(images)
    print(images)

# Write one plain-text row per model option. Previously the <select> tag
# itself was handed to the writer as a "row", so its child tags were
# serialized as HTML markup and all landed in a single row. The file is now
# opened once, for writing only, and closed when the block exits.
with open(r"asus_stock.csv", "wb") as csv_file:
    c = UnicodeWriter(csv_file)
    c.writerows([option.get_text()] for option in options)
次のように出力されるようにするには、どうすればよいですか。
1. HTML ではなくテキストとして出力する
2. 1 列にまとめず、行ごとに分けて出力する
[編集] CSV ファイルは次のように表示したいです(値は一例です)。
"Brand Name" "CategoryID" "ModelID" "Family" "Name" "Part Number" "Price" "Image src"
Asus | AC Adapter | ET1602 | E Series | Power Cord 3P L:80CM,UK(B) | 14G110008350 |14.77 | image src
[新しい編集]
以下は、各 print 文で出力された値です。
# Locate the model <select> element, remember it, and print its text.
print("FETCHING OPTIONS")
select = soup.find(id='myselectListModel')
selects.append(select)
for item in selects:
    print(item.get_text())
出力:
ET10B ET1602 ET1602C etc..
メイン タイトルを取得中:
# Look up the breadcrumb element of the model page and print its text.
print("FETCHING MAIN TITLE")
maintitle = s.find(id='puffBreadCrumbs')
model_info.append(maintitle)
print(maintitle.get_text())
出力:
Asus - All In One - E Series - ET10B
カテゴリを取得しています
# Collect the links and spans inside the "accordion" element.
datas = s.find(id='accordion')
a = datas.findAll('a')
content = datas.findAll('span')
print("FETCHING CATEGORY")
for data in a:
    if not data.has_attr('onclick'):
        continue
    # Drop the ", this" argument and keep what follows the first "(" of the
    # onclick call, then parse that argument list as a Python tuple.
    call_args = data['onclick'].replace(', this', '').split('(', 1)[1]
    arguments = literal_eval('(' + call_args)
    #model_info.append(arguments)
    print(arguments)
出力:
FETCHING CATEGORY
('Asus', 'AC Adapter', 'ET10B', '6941', 'E Series')
('Asus', '04G265003580')
('Asus', '14G110008340')
('Asus', 'Bracket', 'ET10B', '7138', 'E Series')
('Asus', 'Cable', 'ET10B', '6983', 'E Series')
('Asus', 'Camera', 'ET10B', '6985', 'E Series')
('Asus', 'Cooling', 'ET10B', '6999', 'E Series')
('Asus', 'Cover', 'ET10B', '6984', 'E Series')
etc..
名前の取得:
# Print the text of the first <b> element found on the page.
print("FETCHING NAME")
bold = s.find('b')
name = bold.get_text()
print(name)
出力:
Power Adapter 65W19V 3PIN
部品番号と価格の取得
# Print the text of every accordion <span> that carries a class attribute.
print("FETCHING PART NUMBER AND PRICE (inc. VAT)")
for complete in content:
    if not complete.has_attr('class'):
        continue
    #model_info.append(complete['class'])
    print(complete.get_text())
出力:
FETCHING PART NUMBER AND PRICE (inc. VAT)
Part number: 04G265003580
Remote stock
38.09:- EUR
画像の取得
# Gather the <img> tags inside the first <td> of the page and print them.
print("FETCHING IMAGES")
img = s.find('td')
images = img.findAll('img')
model_info.append(images)
print(images)
出力:
FETCHING IMAGES
[<img alt="" src="/images/Articles/thumbs/04G265003580_thumb.jpg"/>]