0

私はウェブサイトのすべてのページをクロールしますが、現在この問題があります。

ページにクラス「td-cellalign-rightgray」と「td-cellalign-rightgrayrow-border」が含まれている場合は、両方のtext()をitem['price']に書き込みます。
ただし、ページに「td-cell align-right grey row-border」しか含まれていない場合は、item ['price']にtext()のみを記述してください。


コード:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from Test01.items import Test01Item
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
import urlparse



    class ScrapyOrgSpider(BaseSpider):
        name = "oeticket"
        allowed_domains = ["oeticket.com"]
        start_urls = ["http://www.oeticket.com/de/suche/?search_string=amaretto"]

        def parse(self, response):
            hxs = HtmlXPathSelector(response)
            items = []


            next_page = hxs.select("//li[@class='next-page navigation']/a/@href").extract()
            abs_page = []
            for g in next_page:
                abs_page.append("http://oeticket.com" + g )

            if not not abs_page:
                for e in abs_page:
                    yield Request(e, self.parse)

            next_event = hxs.select("//li[@class='event-item vevent']/a/@href").extract()
            abs_event = []
            for it in next_event:
                abs_event.append("http://oeticket.com" + it) 


            if not not abs_event:
                for s in abs_event:
                    yield Request(s, self.parse)

            deeper = hxs.select("//li[@class='performance-item vevent']/a/@href").extract()
            abs_deeper = []
            for c in deeper:
               abs_deeper.append("http://oeticket.com" + c)

            if not not abs_deeper:
                for d in abs_deeper: 
                    yield Request(d, self.parse)

            posts = hxs.select("//ul[@class='grid_10 horizontal-list clearfix']")
            preis = hxs.select("//tbody/tr")


            for post in posts:
                item = Test01Item()

                item["when"] = post.select("li[@class='when']/p/abbr/text()").extract() + post.select("li[@class='when']/h2/text()").extract()
                items.append(item)

            for post in posts:
                item = Test01Item()
                item["what"] = post.select("li[@class='what']/h2/text()").extract()
                items.append(item)

            for post in posts:
                item = Test01Item()
                item["where"] = post.select("li[@class='where']/h2/text()").extract()
                items.append(item)



            for prei in preis:
                item = Test01Item()
                item['url'] = response.url
                item['price'] = prei.select("td[@class='ticket_price td-cell ucase black strong align-right']/text()").extract()
                item['price'] = prei.select("td[@class='ticket_price td-cell ucase black strong align-right row-border']/text()").extract()
                item["func"] = prei.select("td[@class='td-cell align-right gray']/text()").extract()
                item["func"] = prei.select("td[@class='td-cell align-right gray row-border']/text()").extract()

                items.append(item)

            for item in items:
                yield item


結果:

{"when": ["Donnerstag,  7. Feb 2013 ", "20:00"]},
{"what": ["Amaretto"]},
{"where": ["kleines theater"]},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": [], "func": []},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": ["  15,90 EUR  "], "func": ["  Erm\u00e4\u00dfigung lt. Info - ACHTUNG: Ausweiskontrolle!  "]},


期待される結果:

{"when": ["Donnerstag,  7. Feb 2013 ", "20:00"]},
{"what": ["Amaretto"]},
{"where": ["kleines theater"]},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": ["  22,50 EUR  "], "func": ["  Normalpreis  "},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": ["  15,90 EUR  "], "func": ["  Erm\u00e4\u00dfigung lt. Info - ACHTUNG: Ausweiskontrolle!  "]},

空白のItem-Fieldsを使用して、この問題を修正するにはどうすればよいですか?ありがとう!

4

1 に答える 1

1

リストの長さが0の場合は、リスト項目を確認する必要があります

item['price'] = prei.select("td[@class='ticket_price td-cell ucase black strong align-right']/text()").extract()
if len(item['price']) == 0:
   item['price'] = prei.select("td[@class='ticket_price td-cell ucase black strong align-right row-border']/text()").extract()
于 2013-01-14T12:45:55.050 に答える