python - 正規表現を使って再帰的にクロールする Scrapy のルールを書く方法は?

Question

URLのパターンは
http://www.khmer24.com/ad/change-petrol-to-gas-use-injector-special-price/67-204320.html
URL のうち、ドメイン、「ad」、および番号 67 の部分は固定のままにしたいです。サンプル URL は次のとおりです:
http://www.khmer24.com/ad/ANY-STRING/67-123456789.html

ここに私のスパイダーのコードがあります:

from scrapy.item import Item, Field

class Khmer24(Item):
    """Scrapy item carrying the two values the spider scrapes per ad block."""
    # Filled by the spider with the extracted "h1/text()" nodes.
    title = Field()
    # Filled by the spider with the extracted price <strong> text nodes.
    price = Field()

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class MySpider(CrawlSpider):
    """Crawl www.khmer24.com and scrape title/price from matching ad pages.

    Links are followed when their URL has the form
    ``/ad/<any-slug>/67-<digits>.html``; every matching response is handed
    to :meth:`parse_items`.
    """

    name = "khmer24"
    allowed_domains = ["www.khmer24.com"]
    start_urls = ["http://www.khmer24.com/"]

    # The pattern must be a raw string: in a normal string literal "\67" is
    # an octal escape for the character "7". The previous pattern also
    # required a literal "index/ad", exactly one digit, one whitespace
    # character and a literal "00", so it could never match a real ad URL
    # such as
    #   http://www.khmer24.com/ad/some-slug/67-204320.html
    # "[^/]+" accepts any slug, "\d+" any numeric id, and "\." a literal dot.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r"/ad/[^/]+/67-\d+\.html",),
                # NOTE(review): this limits extraction to links found inside
                # <p class="nextpage">. If the ad links live elsewhere on the
                # page, nothing will match -- confirm against the site markup
                # and drop this argument if necessary.
                restrict_xpaths=('//p[@class="nextpage"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Build one ``Khmer24`` item per ``div.innerbox`` in the response.

        Returns a list of items; ``title`` and ``price`` each hold the list
        of text nodes returned by ``extract()`` (possibly empty).
        """
        hxs = HtmlXPathSelector(response)
        price_xpath = (
            "table/tr/td/p[@class='description']"
            "/span[@class='price']/strong/text()"
        )
        items = []
        for box in hxs.select("//div[@class='innerbox']"):
            item = Khmer24()
            item["title"] = box.select("h1/text()").extract()
            item["price"] = box.select(price_xpath).extract()
            items.append(item)
        return items

score 2 · Accepted Answer

リンクエクストラクタの allow に指定する正規表現だけを探しているようです。これを試してください：

/ad/[^/]+/67-\d+\.html

また、ホームページに対して実行すると、次のようなリンクが抽出されます。

>>> le = SgmlLinkExtractor(allow=r'/ad/[^/]+/67-\d+\.html')
>>> le.extract_links(response)
[Link(url='http://www.khmer24.com/ad/change-petrol-to-gas-use-injector-special-price/67-204320.html', text=u'', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/i-want-to-sell-my-car-toyota-corolla-s-2003/67-253891.html', text=u'', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/corolla-altis-2002/67-242425.html', text=u'Corolla Altis 2002', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/nissan-crew-1997/67-256846.html', text=u'Nissan crew 1997', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/white-nissan-march-2002/67-198118.html', text=u'White Nissan March 2002', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/mercedes-s430-black-year-2000-phnom-penh/67-257711.html', text=u'Mercedes S430, Black Year 2000, Phnom Penh', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/car-for-sale-or-exchangeprado-2007/67-233230.html', text=u'Car for sale or exchange(PRADO 2007)', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/urgent-toyota-hybrid-pruis-2001-abs-brake/67-164632.html', text=u'URGENT Toyota Hybrid PRUIS 2001 . ABS brake', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/camry-97-xle-for-sale/67-254704.html', text=u'Camry 97-XLE For Sale', fragment='', nofollow=False),
 Link(url='http://www.khmer24.com/ad/honda-civic-98-silver-for-sale/67-193666.html', text=u'Honda Civic 98 Silver For Sale', fragment='', nofollow=False)]

python - 正規表現を使って再帰的にクロールする Scrapy のルールを書く方法は?

回答 1 件

Related

Reference