I'm using Scrapy to collect data from stox.vn. I have a urls.txt with about 800 URLs, all of which I feed to the spider. At first it crawls and scrapes just fine, but after a while it stops scraping and only keeps crawling (note the "Crawled" lines below with no "Scraped" lines after them):
2013-06-27 03:24:28+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA> (referer: http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA)
2013-06-27 03:24:28+0700 [stox] DEBUG: Scraped from <200 http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA>
{'chi_phi_ban_hang': u'-7453.41',
'chi_phi_khau_hao_TSCD': u'11890.11',
'chi_phi_quan_ly': u'-5913.60',
'chi_phi_tai_chinh': u'-10677.99',
'chi_phi_tien_lai_vay': u'-5672.17',
'doanh_thu_thuan': u'122008.75',
'gia_von_hang_ban': u'-90790.07',
'lai_co_dong_ct_me': u'11885.60',
'lai_gop': u'31218.69',
'lai_sau_thue': u'11885.60',
'lai_tu_hdkd': u'11376.31',
'loi_ich_CDTS': u'11885.60',
'qtime': u'20101',
'thu_nhap_tai_chinh': u'4202.63',
'thue_TNDN_hl': u'509.29',
'thue_TNDN_ht': u'0',
'ticker': 'AAA'}
.....
2013-06-27 03:24:31+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI> (referer: None)
2013-06-27 03:24:33+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT> (referer: None)
2013-06-27 03:24:36+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=164&iId=289&iIdL=279&eId=1&tId=0status=1&id=-1&cat=&ticker=ACB> (referer: None)
2013-06-27 03:24:38+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=522&iId=180&iIdL=170&eId=0&tId=2status=1&id=-1&cat=&ticker=ACC> (referer: None)
2013-06-27 03:24:40+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=486&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ACE> (referer: None)
2013-06-27 03:24:42+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=2&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ACL> (referer: None)
2013-06-27 03:24:44+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=858&iId=256&iIdL=241&eId=1&tId=2status=1&id=-1&cat=&ticker=ADC> (referer: None)
2013-06-27 03:24:47+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=556&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ADP> (referer: None)
Here is what I'm doing in stox/spider/test.py:
from scrapy import log
import logging
from scrapy.log import ScrapyFileLogObserver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from stox.items import StoxItem
from scrapy.http import Request, Response
from scrapy.http.cookies import CookieJar
from scrapy.contrib.exporter import CsvItemExporter

class MySpider(BaseSpider):
    name = "stox"
    allowed_domains = ["stox.vn"]
    start_urls = ["http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=FPT",
                  "http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=SSC"]
    ticker = ""
    items = []

    def __init__(self):
        super(MySpider, self).__init__()  # let BaseSpider do its own initialisation
        # write the log to a file
        logfile = open('testlog.log', 'w')
        log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()  # start logging
    def start_requests(self):
        # read the URL file and schedule a request for each entry
        f = open("urls.txt")
        start_urls = [url.strip() for url in f.readlines()]
        f.close()
        for url in start_urls:
            yield Request(url, self.parse)
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        self.ticker = "".join(hxs.select("//div[@class='stock-ticker-title']/label/text()").extract()).strip()
        my_start_url = "http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=%s" % self.ticker

        # carry the cookies of the start URL over to the PV_Index request
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
        request = Request(my_start_url, callback=self.extractItem,
                          meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
        cookieJar.add_cookie_header(request)  # apply Set-Cookie ourselves
        yield request
    def extractItem(self, response):
        items = []
        # extract the ticker from the URL
        pos = response.url.find('ticker=')
        l = len("ticker=")
        ticker = response.url[pos + l:]
        f = open("data/%s.csv" % ticker, 'w')
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p[@data-time]/..")
        for title in titles:
            item = StoxItem()
            item["ticker"] = ticker
            item["qtime"] = "".join(title.select("./p/@data-time").extract())
            # the numbers use Vietnamese formatting ('.' as thousands separator,
            # ',' as decimal point), hence the two replace() calls
            item["doanh_thu_thuan"] = ''.join(title.select("./div[1]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["gia_von_hang_ban"] = ''.join(title.select("./div[1]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_gop"] = ''.join(title.select("./div[2]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thu_nhap_tai_chinh"] = ''.join(title.select("./div[2]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_tai_chinh"] = ''.join(title.select("./div[2]/p[3]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_tien_lai_vay"] = ''.join(title.select("./div[2]/p[4]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_ban_hang"] = ''.join(title.select("./div[2]/p[5]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_quan_ly"] = ''.join(title.select("./div[2]/p[6]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_tu_hdkd"] = ''.join(title.select("./div[3]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thue_TNDN_ht"] = ''.join(title.select("./div[3]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thue_TNDN_hl"] = ''.join(title.select("./div[3]/p[3]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_sau_thue"] = ''.join(title.select("./div[4]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            # NB: same XPath as lai_sau_thue above -- which would explain why both
            # fields (and lai_co_dong_ct_me) come out identical in the log
            item["loi_ich_CDTS"] = ''.join(title.select("./div[4]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_co_dong_ct_me"] = ''.join(title.select("./div[5]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_khau_hao_TSCD"] = ''.join(title.select("./div[6]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            items.append(item)
            # write the row to the per-ticker CSV file
            line = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (item["ticker"],
                item["qtime"],
                item["doanh_thu_thuan"],
                item["gia_von_hang_ban"],
                item["lai_gop"],
                item["thu_nhap_tai_chinh"],
                item["chi_phi_tai_chinh"],
                item["chi_phi_tien_lai_vay"],
                item["chi_phi_ban_hang"],
                item["chi_phi_quan_ly"],
                item["lai_tu_hdkd"],
                item["thue_TNDN_ht"],
                item["thue_TNDN_hl"],
                item["lai_sau_thue"],
                item["loi_ich_CDTS"],
                item["lai_co_dong_ct_me"],
                item["chi_phi_khau_hao_TSCD"])
            f.write(line)
        f.close()
        return items
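Side note: I import CsvItemExporter but never use it. Eventually I'd like to move the CSV writing out of the callback and into an item pipeline, something like this sketch (the class name StoxCsvPipeline and the data/all.csv output path are just placeholders):

# stox/pipelines.py -- sketch only, not what I'm running now
from scrapy.contrib.exporter import CsvItemExporter

class StoxCsvPipeline(object):
    def open_spider(self, spider):
        self.file = open('data/all.csv', 'wb')  # assumed single output file
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

It would be enabled with ITEM_PIPELINES = ['stox.pipelines.StoxCsvPipeline'], like the commented-out line in settings.py below.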
My settings.py:
BOT_NAME = 'stox'
SPIDER_MODULES = ['stox.spiders']
NEWSPIDER_MODULE = 'stox.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stox (+http://www.yourdomain.com)'
#ITEM_PIPELINES = ['stox.pipelines.StoxPipeline']
DOWNLOAD_DELAY = 2
#DOWNLOAD_TIMEOUT = 180
#CONCURRENT_REQUESTS = 2
If I change the CONCURRENT_REQUESTS setting, I can confirm that the spider scrapes exactly CONCURRENT_REQUESTS items and then stops scraping, only crawling from that point on. So I suspect the problem is with the concurrent requests (am I failing to release something???)
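One thing I want to rule out before blaming concurrency: the same "Crawled but never Scraped" pattern can also appear when Scrapy's duplicate-request filter silently drops requests. If the ticker XPath in parse() ever comes back empty, every generated my_start_url would be identical, and all but the first would be discarded. A minimal sketch of a guard for this, reusing the imports already at the top of the spider (dont_filter is a standard Request argument; the warning message and the early return are my additions):

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # a local variable instead of self.ticker: spider attributes are
        # shared between all concurrently processed responses
        ticker = "".join(hxs.select("//div[@class='stock-ticker-title']/label/text()").extract()).strip()
        if not ticker:
            self.log("no ticker found on %s" % response.url, level=log.WARNING)
            return
        my_start_url = "http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=%s" % ticker
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
        # dont_filter=True bypasses the duplicate-request filter; if items
        # reappear with it, the requests were being dropped as duplicates
        request = Request(my_start_url, callback=self.extractItem, dont_filter=True,
                          meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
        cookieJar.add_cookie_header(request)
        yield request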
UPDATE: contents of urls.txt:
http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA
http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI
http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT
.....
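While pasting these, I notice the query strings differ from my hard-coded start_urls: the file has tId=2status=1 (no '&' between tId and status) and cat=, whereas the working start_urls have tId=2&status=1 and cats=. A throwaway Python 2 check for such fused parameters (fused_params is just a helper name I made up; urlparse is the standard library):

import urlparse

def fused_params(url):
    # parse_qsl splits each '&'-separated chunk on the first '=', so a
    # value that still contains '=' means a '&' went missing
    query = urlparse.urlparse(url).query
    return [(k, v) for k, v in urlparse.parse_qsl(query) if '=' in v]

for line in open('urls.txt'):
    bad = fused_params(line.strip())
    if bad:
        print line.strip(), '->', bad  # e.g. [('tId', '2status=1')]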
Any help is greatly appreciated! Thank you.
P.S.: I'm very new to Scrapy, and sorry for my poor English.