Python の scrapy を使用してページをスクレイピングしようとしています。いくつかのスクレイピング操作の後、scrapy は次のエラーを表示して終了します:
twisted.internet.error.TimeoutError
これが私のコードです:
#infobel_spider.py
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http import FormRequest
from infobel.items import InfobelItem
import sys
import xlwt
import re
import codecs
class InfobelSpider(BaseSpider):
    """Crawls infobel.com business listings, following pagination links."""
    name = 'infobel'
    start_urls = ['http://www.infobel.com/en/italy/business/20300/accessories']

    def parse(self, response):
        """Parse one result page: queue the next page, then yield one
        InfobelItem per result entry.

        NOTE(review): the retry/timeout failures in the log occur while
        *downloading* the next page — they come from the proxy layer
        (likely dead proxies in PROXIES), not from this callback.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//a[@id='Pagination1_lnkNextRec']/@href").extract()
        # Idiomatic truthiness test (original used "if not not next_page").
        if next_page:
            yield Request("http://www.infobel.com" + next_page[0], self.parse)
        # One item per result box; each field keeps the raw .extract() list,
        # exactly as the original did (items are yielded directly instead of
        # being collected into a temporary list first).
        for q in hxs.select("//div[@class='result-item clearfix']"):
            item = InfobelItem()
            item['name'] = q.select('div/div/h2/a/span/text()').extract()
            item['address'] = q.select('div/div/ul/li[1]/div/span/text()').extract()
            item['phone'] = q.select('div/div/ul/li[2]/div/text()').extract()
            item['email'] = q.select('div/div/ul/li[3]/div/a/text()').extract()
            item['website'] = q.select('div/div/ul/li[4]/div/a/@href').extract()
            item['category'] = q.select("div/div[@class='categories']/div/ul/li/text()").extract()
            yield item
#items.py
from scrapy.item import Item, Field
class InfobelItem(Item):
    """Container for one scraped business listing.

    Each field holds the raw list returned by ``HtmlXPathSelector.extract()``.
    """
    name = Field()      # business name
    address = Field()   # street / postal code / city fragments
    phone = Field()     # phone number
    email = Field()     # contact e-mail address
    category = Field()  # business category labels
    website = Field()   # tracked (relative) website URL
#middlewares.py
import base64
import random
from settings import PROXIES
class ProxyMiddleware(object):
    """Downloader middleware that routes each request through a random
    proxy taken from ``settings.PROXIES``.

    Bug fixed: the original tested ``proxy['user_pass'] is not None`` while
    every PROXIES entry uses ``'user_pass': ''`` — so a bogus
    ``Proxy-Authorization`` header (base64 of the empty string, including
    the trailing newline ``base64.encodestring`` appends) was sent with
    every request.  A truthiness test plus ``b64encode`` (no trailing
    newline) fixes both problems.
    """

    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        # The proxy endpoint is set unconditionally (both branches of the
        # original did this).
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        # Attach credentials only when a non-empty "user:pass" is configured.
        if proxy['user_pass']:
            encoded_user_pass = base64.b64encode(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
#pipelines.py
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, MapCompose, Join
import re
import json
import csv
class InfobelPipeline(object):
    """Writes every scraped item as one row of ``items.csv``."""

    # Column order used for every row (matches the original tuple order).
    FIELDS = ('name', 'address', 'phone', 'email', 'category', 'website')

    def __init__(self):
        # Keep the file handle: the original passed the open() result
        # straight to csv.writer and discarded it, so the file could never
        # be closed explicitly.
        self._fh = open('items.csv', 'wb')
        self.file = csv.writer(self._fh)

    def process_item(self, item, spider):
        """Append one CSV row for ``item`` and pass the item through."""
        self.file.writerow(tuple(item[f] for f in self.FIELDS))
        return item
#settings.py
# Bot identity; USER_AGENT below is derived from these two values.
BOT_NAME = 'infobel'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['infobel.spiders']
NEWSPIDER_MODULE = 'infobel.spiders'
DEFAULT_ITEM_CLASS = 'infobel.items.InfobelItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
ITEM_PIPELINES = ['infobel.pipelines.InfobelPipeline']
# Proxy pool consumed by middlewares.ProxyMiddleware.
# NOTE(review): with 'user_pass': '' the middleware's "is not None" test is
# always true, so an empty Proxy-Authorization header is sent.  The repeated
# "User timeout caused connection failure" entries in the log suggest these
# public proxies may simply be dead/unreachable — verify them independently.
PROXIES = [{'ip_port': '41.43.31.226:8080', 'user_pass': ''},
{'ip_port': '64.120.226.94:8080', 'user_pass': ''},
{'ip_port': '196.2.73.246:3128', 'user_pass': ''},]
# Lower number = earlier in the download chain: ProxyMiddleware (100) sets
# request.meta['proxy'] before the built-in HttpProxyMiddleware (110) runs.
DOWNLOADER_MIDDLEWARES = {
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
'infobel.middlewares.ProxyMiddleware': 100,
}
出力は次のとおりです。
[infobel] INFO: Passed InfobelItem(website=[u'track.aspx?id=0&url=http://www.bbmodena.it'], category=[u'TELEVISION, VIDEO AND HI-FI EMERGENCY BREAKDOWN SERVICES, REPAIRS AND SPARE PARTS'], name=[u'B & B (S.R.L.) (RIP.TVC VIDEO HI-FI)'], phone=[u'059254545'], address=[u'V. MALAVOLTI\xa047', u'41100', u'MODENA'], email=[u'info@bbmodena.it'])
[infobel] DEBUG: Scraped InfobelItem(website=[u'track.aspx?id=0&url=http://sitoinlavorazione.seat.it/boninispa'], category=[u'AUTOMOBILE AGENTS, DEALERS AND DEALERSHIPS'], name=[u'BONINI (S.P.A.) (CONCESSIONARIA RENAULT)'], phone=[u'035310333'], address=[u'V. S. BERNARDINO\xa0151', u'24126', u'BERGAMO'], email=[u'info@boniniautospa.it']) in <http://www.infobel.com/en/italy/business/20300/accessories>
[infobel] INFO: Passed InfobelItem(website=[u'track.aspx?id=0&url=http://sitoinlavorazione.seat.it/boninispa'], category=[u'AUTOMOBILE AGENTS, DEALERS AND DEALERSHIPS'], name=[u'BONINI (S.P.A.) (CONCESSIONARIA RENAULT)'], phone=[u'035310333'], address=[u'V. S. BERNARDINO\xa0151', u'24126', u'BERGAMO'], email=[u'info@boniniautospa.it'])
[infobel] DEBUG: Retrying <GET http://www.infobel.com/en/italy/business/20300/accessories/10> (failed 1 times): 200 OK
[infobel] DEBUG: Retrying <GET http://www.infobel.com/en/italy/business/20300/accessories/10> (failed 2 times): 200 OK
[infobel] DEBUG: Discarding <GET http://www.infobel.com/en/italy/business/20300/accessories/10> (failed 3 times): User timeout caused connection failure.
[infobel] ERROR: Error downloading <http://www.infobel.com/en/italy/business/20300/accessories/10>: [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.TimeoutError'>: User timeout caused connection failure.
[infobel] INFO: Closing spider (finished)
[infobel] INFO: Spider closed (finished)