URLのパターンは次のとおりです:
http://www.khmer24.com/ad/change-petrol-to-gas-use-injector-special-price/67-204320.html
URLのうち、ドメイン、「ad」、および番号の「67」の部分は固定のまま残し、それ以外の部分は任意の値にマッチさせたいです。サンプル URL は次のとおりです:
http://www.khmer24.com/ad/ANY-STRING/67-123456789.html
ここに私のスパイダーのコードがあります:
from scrapy.item import Item, Field
class Khmer24(Item):
    """Container for the data scraped from one ad block on khmer24.com."""

    # Text of the ad's <h1> heading, as extracted by the spider.
    title = Field()

    # Text of the ad's price element, as extracted by the spider.
    price = Field()
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class MySpider(CrawlSpider):
    """Crawl www.khmer24.com and scrape title/price from matching ad pages.

    Only links whose URL has the shape ``/ad/<any-slug>/67-<digits>.html``
    are followed and handed to :meth:`parse_items`.
    """

    name = "khmer24"
    allowed_domains = ["www.khmer24.com"]
    start_urls = ["http://www.khmer24.com/"]

    rules = (
        Rule(
            SgmlLinkExtractor(
                # Matches e.g.
                #   http://www.khmer24.com/ad/ANY-STRING/67-123456789.html
                # The pattern MUST be a raw string: in a normal string
                # literal ``\67`` is an octal escape for the character "7",
                # so the regex engine would never see the intended text.
                #   /ad/      literal path segment
                #   [^/]+     the free-form ad slug (one path segment)
                #   /67-      the fixed numeric prefix
                #   \d+       the ad number, any length
                #   \.html    literal extension (dot escaped)
                allow=(r"/ad/[^/]+/67-\d+\.html",),
                # NOTE(review): links are only collected from inside
                # <p class="nextpage">. If the ad links live elsewhere on the
                # page this restriction will filter them all out -- confirm
                # against the site's markup and drop it if necessary.
                restrict_xpaths=('//p[@class="nextpage"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Build one ``Khmer24`` item per ``div.innerbox`` in the response.

        Args:
            response: The downloaded page passed in by the crawl rule.

        Returns:
            A list of ``Khmer24`` items. ``title`` and ``price`` hold
            whatever ``extract()`` returns for the matching text nodes
            (possibly empty when the node is absent).
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for box in hxs.select("//div[@class='innerbox']"):
            item = Khmer24()
            item["title"] = box.select("h1/text()").extract()
            item["price"] = box.select(
                "table/tr/td/p[@class='description']"
                "/span[@class='price']/strong/text()"
            ).extract()
            items.append(item)
        return items