I'm using Scrapy to collect data from stox.vn. I have a urls.txt with about 800 URLs, all of which I feed to the spider. At first it crawls and scrapes just fine, but after a while it stops scraping and only keeps crawling (note the "Crawled" lines below with no "Scraped" lines after them):
2013-06-27 03:24:28+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA> (referer: http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA)
2013-06-27 03:24:28+0700 [stox] DEBUG: Scraped from <200 http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA>
{'chi_phi_ban_hang': u'-7453.41',
'chi_phi_khau_hao_TSCD': u'11890.11',
'chi_phi_quan_ly': u'-5913.60',
'chi_phi_tai_chinh': u'-10677.99',
'chi_phi_tien_lai_vay': u'-5672.17',
'doanh_thu_thuan': u'122008.75',
'gia_von_hang_ban': u'-90790.07',
'lai_co_dong_ct_me': u'11885.60',
'lai_gop': u'31218.69',
'lai_sau_thue': u'11885.60',
'lai_tu_hdkd': u'11376.31',
'loi_ich_CDTS': u'11885.60',
'qtime': u'20101',
'thu_nhap_tai_chinh': u'4202.63',
'thue_TNDN_hl': u'509.29',
'thue_TNDN_ht': u'0',
'ticker': 'AAA'}
.....
2013-06-27 03:24:31+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI> (referer: None)
2013-06-27 03:24:33+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT> (referer: None)
2013-06-27 03:24:36+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=164&iId=289&iIdL=279&eId=1&tId=0status=1&id=-1&cat=&ticker=ACB> (referer: None)
2013-06-27 03:24:38+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=522&iId=180&iIdL=170&eId=0&tId=2status=1&id=-1&cat=&ticker=ACC> (referer: None)
2013-06-27 03:24:40+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=486&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ACE> (referer: None)
2013-06-27 03:24:42+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=2&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ACL> (referer: None)
2013-06-27 03:24:44+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=858&iId=256&iIdL=241&eId=1&tId=2status=1&id=-1&cat=&ticker=ADC> (referer: None)
2013-06-27 03:24:47+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=556&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ADP> (referer: None)
Here is what I'm doing in stox/spider/test.py:
from scrapy import log
import logging
from scrapy.log import ScrapyFileLogObserver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from stox.items import StoxItem
from scrapy.http import Request, Response
from scrapy.http.cookies import CookieJar
from scrapy.contrib.exporter import CsvItemExporter

class MySpider(BaseSpider):
    name = "stox"
    allowed_domains = ["stox.vn"]
    start_urls = ["http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=FPT",
                  "http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=SSC"]
    ticker = ""
    items = []

    def __init__(self):
        super(MySpider, self).__init__()  # let BaseSpider do its own initialisation
        # write the log to a file
        logfile = open('testlog.log', 'w')
        log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()  # start logging
    def start_requests(self):
        # read the URL file and schedule a request for each entry
        f = open("urls.txt")
        start_urls = [url.strip() for url in f.readlines()]
        f.close()
        for url in start_urls:
            yield Request(url, self.parse)
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        self.ticker = "".join(hxs.select("//div[@class='stock-ticker-title']/label/text()").extract()).strip()
        my_start_url = "http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=%s" % self.ticker

        # carry the cookies of the start URL over to the PV_Index request
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
        request = Request(my_start_url, callback=self.extractItem,
                          meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
        cookieJar.add_cookie_header(request)  # apply Set-Cookie ourselves
        yield request
    def extractItem(self, response):
        items = []
        # extract the ticker from the URL
        pos = response.url.find('ticker=')
        l = len("ticker=")
        ticker = response.url[pos + l:]
        f = open("data/%s.csv" % ticker, 'w')
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p[@data-time]/..")
        for title in titles:
            item = StoxItem()
            item["ticker"] = ticker
            item["qtime"] = "".join(title.select("./p/@data-time").extract())
            # the numbers use Vietnamese formatting ('.' as thousands separator,
            # ',' as decimal point), hence the two replace() calls
            item["doanh_thu_thuan"] = ''.join(title.select("./div[1]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["gia_von_hang_ban"] = ''.join(title.select("./div[1]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_gop"] = ''.join(title.select("./div[2]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thu_nhap_tai_chinh"] = ''.join(title.select("./div[2]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_tai_chinh"] = ''.join(title.select("./div[2]/p[3]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_tien_lai_vay"] = ''.join(title.select("./div[2]/p[4]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_ban_hang"] = ''.join(title.select("./div[2]/p[5]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_quan_ly"] = ''.join(title.select("./div[2]/p[6]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_tu_hdkd"] = ''.join(title.select("./div[3]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thue_TNDN_ht"] = ''.join(title.select("./div[3]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thue_TNDN_hl"] = ''.join(title.select("./div[3]/p[3]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_sau_thue"] = ''.join(title.select("./div[4]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            # NB: same XPath as lai_sau_thue above -- which would explain why both
            # fields (and lai_co_dong_ct_me) come out identical in the log
            item["loi_ich_CDTS"] = ''.join(title.select("./div[4]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_co_dong_ct_me"] = ''.join(title.select("./div[5]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_khau_hao_TSCD"] = ''.join(title.select("./div[6]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            items.append(item)
            # write the row to the per-ticker CSV file
            line = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (item["ticker"],
                item["qtime"],
                item["doanh_thu_thuan"],
                item["gia_von_hang_ban"],
                item["lai_gop"],
                item["thu_nhap_tai_chinh"],
                item["chi_phi_tai_chinh"],
                item["chi_phi_tien_lai_vay"],
                item["chi_phi_ban_hang"],
                item["chi_phi_quan_ly"],
                item["lai_tu_hdkd"],
                item["thue_TNDN_ht"],
                item["thue_TNDN_hl"],
                item["lai_sau_thue"],
                item["loi_ich_CDTS"],
                item["lai_co_dong_ct_me"],
                item["chi_phi_khau_hao_TSCD"])
            f.write(line)
        f.close()
        return items
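Side note: I import CsvItemExporter but never use it. Eventually I'd like to move the CSV writing out of the callback and into an item pipeline, something like this sketch (the class name StoxCsvPipeline and the data/all.csv output path are just placeholders):

# stox/pipelines.py -- sketch only, not what I'm running now
from scrapy.contrib.exporter import CsvItemExporter

class StoxCsvPipeline(object):
    def open_spider(self, spider):
        self.file = open('data/all.csv', 'wb')  # assumed single output file
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

It would be enabled with ITEM_PIPELINES = ['stox.pipelines.StoxCsvPipeline'], like the commented-out line in settings.py below.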
My settings.py:
BOT_NAME = 'stox'
SPIDER_MODULES = ['stox.spiders']
NEWSPIDER_MODULE = 'stox.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stox (+http://www.yourdomain.com)'
#ITEM_PIPELINES = ['stox.pipelines.StoxPipeline']
DOWNLOAD_DELAY = 2
#DOWNLOAD_TIMEOUT = 180
#CONCURRENT_REQUESTS = 2
If I change the CONCURRENT_REQUESTS setting, I can confirm that the spider scrapes exactly CONCURRENT_REQUESTS items and then stops scraping, only crawling from that point on. So I suspect the problem is with the concurrent requests (am I failing to release something???)
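One thing I want to rule out before blaming concurrency: the same "Crawled but never Scraped" pattern can also appear when Scrapy's duplicate-request filter silently drops requests. If the ticker XPath in parse() ever comes back empty, every generated my_start_url would be identical, and all but the first would be discarded. A minimal sketch of a guard for this, reusing the imports already at the top of the spider (dont_filter is a standard Request argument; the warning message and the early return are my additions):

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # a local variable instead of self.ticker: spider attributes are
        # shared between all concurrently processed responses
        ticker = "".join(hxs.select("//div[@class='stock-ticker-title']/label/text()").extract()).strip()
        if not ticker:
            self.log("no ticker found on %s" % response.url, level=log.WARNING)
            return
        my_start_url = "http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=%s" % ticker
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
        # dont_filter=True bypasses the duplicate-request filter; if items
        # reappear with it, the requests were being dropped as duplicates
        request = Request(my_start_url, callback=self.extractItem, dont_filter=True,
                          meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
        cookieJar.add_cookie_header(request)
        yield request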
UPDATE: contents of urls.txt:
http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA
http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI
http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT
.....
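While pasting these, I notice the query strings differ from my hard-coded start_urls: the file has tId=2status=1 (no '&' between tId and status) and cat=, whereas the working start_urls have tId=2&status=1 and cats=. A throwaway Python 2 check for such fused parameters (fused_params is just a helper name I made up; urlparse is the standard library):

import urlparse

def fused_params(url):
    # parse_qsl splits each '&'-separated chunk on the first '=', so a
    # value that still contains '=' means a '&' went missing
    query = urlparse.urlparse(url).query
    return [(k, v) for k, v in urlparse.parse_qsl(query) if '=' in v]

for line in open('urls.txt'):
    bad = fused_params(line.strip())
    if bad:
        print line.strip(), '->', bad  # e.g. [('tId', '2status=1')]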
Any help is greatly appreciated! Thank you.
P.S.: I'm very new to Scrapy, and sorry for my poor English.