次のような Scrapy 出力があります。
[{'gender': 'women',
'name': 'NEW IN: CLOTHING',
'products': [{'name': 'Free People Cocoon Multi Way Neck Top',
'price': {'currency': 'GBP',
'outlet': '40.0',
'retail': '58.0'}}]},
{'gender': 'women',
'name': 'NEW IN: CLOTHING',
'products': [{'name': 'Free People Cocoon Multi Way Neck Top',
'price': {'currency': 'GBP',
'outlet': '40.0',
'retail': '58.0'}},
{'name': 'N12H Joshua Tree Dress',
'price': {'currency': 'GBP',
'outlet': '140.0',
'retail': '249.0'}}]},
{'gender': 'women',
'name': 'NEW IN: CLOTHING',
'products': [{'name': 'Free People Cocoon Multi Way Neck Top',
'price': {'currency': 'GBP',
'outlet': '40.0',
'retail': '58.0'}},
{'name': 'N12H Joshua Tree Dress',
'price': {'currency': 'GBP',
'outlet': '140.0',
'retail': '249.0'}},
{'name': 'Twiin Method Rib Mesh Flare Sleeve Top',
'price': {'currency': 'GBP',
'outlet': '22.0',
'retail': '32.0'}}]},
{'gender': 'women',
'name': 'NEW IN: CLOTHING',
'products': [{'name': 'Free People Cocoon Multi Way Neck Top',
'price': {'currency': 'GBP',
'outlet': '40.0',
'retail': '58.0'}},
{'name': 'N12H Joshua Tree Dress',
'price': {'currency': 'GBP',
'outlet': '140.0',
'retail': '249.0'}},
{'name': 'Twiin Method Rib Mesh Flare Sleeve Top',
'price': {'currency': 'GBP',
'outlet': '22.0',
'retail': '32.0'}},
{'name': 'Twiin End Game Varsity Denim Trucker Jacket',
'price': {'currency': 'GBP',
'outlet': '45.0',
'retail': '80.0'}}]},
{'gender': 'women',
'name': 'NEW IN: SHOES & ACCESSORIES ',
'products': [{'name': 'Melissa Ultragirl Triple Bow Ballerina',
'price': {'currency': 'GBP',
'outlet': '48.0',
'retail': '68.0'}}]},
{'gender': 'women',
'name': 'NEW IN: SHOES & ACCESSORIES ',
'products': [{'name': 'Melissa Ultragirl Triple Bow Ballerina',
'price': {'currency': 'GBP',
'outlet': '48.0',
'retail': '68.0'}},
{'name': 'Zaxy Tbar Flip Flops',
'price': {'currency': 'GBP',
'outlet': '20.0',
'retail': '26.0'}}]},
{'gender': 'women',
'name': 'NEW IN: SHOES & ACCESSORIES ',
'products': [{'name': 'Melissa Ultragirl Triple Bow Ballerina',
'price': {'currency': 'GBP',
'outlet': '48.0',
'retail': '68.0'}},
{'name': 'Zaxy Tbar Flip Flops',
'price': {'currency': 'GBP',
'outlet': '20.0',
'retail': '26.0'}},
{'name': 'Estella Bartlet Silver Plated Heart Bracelet Duo Set',
'price': {'currency': 'GBP',
'outlet': '15.0',
'retail': '31.0'}}]},
{'gender': 'women',
'name': 'NEW IN: SHOES & ACCESSORIES ',
'products': [{'name': 'Melissa Ultragirl Triple Bow Ballerina',
'price': {'currency': 'GBP',
'outlet': '48.0',
'retail': '68.0'}},
{'name': 'Zaxy Tbar Flip Flops',
'price': {'currency': 'GBP',
'outlet': '20.0',
'retail': '26.0'}},
{'name': 'Estella Bartlet Silver Plated Heart Bracelet Duo Set',
'price': {'currency': 'GBP',
'outlet': '15.0',
'retail': '31.0'}},
{'name': 'Ashiana Embroidered Large Toiletry Bag With Wateproof '
'Lining',
'price': {'currency': 'GBP',
'outlet': '25.0',
'retail': '35.0'}}]}]
これは、製品を 1 件処理するたびに ItemLoader.load_item() を呼び出し、その時点のアイテムを出力しているためです。
次のように、パイプラインまたは出力プロセッサを構築して、最後に処理されたアイテムのみを返すようにするにはどうすればよいですか?
[{'gender': 'women',
'name': 'NEW IN: CLOTHING',
'products': [{'name': 'Free People Cocoon Multi Way Neck Top',
'price': {'currency': 'GBP',
'outlet': '40.0',
'retail': '58.0'}},
{'name': 'N12H Joshua Tree Dress',
'price': {'currency': 'GBP',
'outlet': '140.0',
'retail': '249.0'}},
{'name': 'Twiin Method Rib Mesh Flare Sleeve Top',
'price': {'currency': 'GBP',
'outlet': '22.0',
'retail': '32.0'}},
{'name': 'Twiin End Game Varsity Denim Trucker Jacket',
'price': {'currency': 'GBP',
'outlet': '45.0',
'retail': '80.0'}}]},
{'gender': 'women',
'name': 'NEW IN: SHOES & ACCESSORIES ',
'products': [{'name': 'Melissa Ultragirl Triple Bow Ballerina',
'price': {'currency': 'GBP',
'outlet': '48.0',
'retail': '68.0'}},
{'name': 'Zaxy Tbar Flip Flops',
'price': {'currency': 'GBP',
'outlet': '20.0',
'retail': '26.0'}},
{'name': 'Estella Bartlet Silver Plated Heart Bracelet Duo Set',
'price': {'currency': 'GBP',
'outlet': '15.0',
'retail': '31.0'}},
{'name': 'Ashiana Embroidered Large Toiletry Bag With Wateproof '
'Lining',
'price': {'currency': 'GBP',
'outlet': '25.0',
'retail': '35.0'}}]}]
各セッションで最後に出力されたアイテムには、そのセッションのすべての製品が含まれています。スパイダーの終了時に処理することも試みましたが、うまくいきませんでした。
このプロジェクトはほぼ完成しており、いろいろと調べて多くの方法を試しましたが、フィールドにアイテムを積み重ねていく方法に関する情報は見つかりませんでした。
私のアイテムコード:
from scrapy.item import Item, Field
from scrapy.loader.processors import TakeFirst, Join, Compose, MapCompose
class Session(Item):
    """A shop section together with the products collected for it."""

    # Section title as shown on the landing page.
    name = Field()
    # Audience segment taken from the request URL.
    gender = Field()
    # Product entries accumulated for this section. No processor is declared
    # for this field.
    # TODO: it is still undecided which processor (if any) belongs here;
    # Join, Compose and MapCompose were tried without success.
    products = Field()
class Product(Item):
    """A single product: its display name and its price data."""

    # Product title.
    name = Field()
    # Price information for the product.
    price = Field()
class Price(Item):
    """Price data for one product."""

    # Outlet price.
    outlet = Field()
    # Retail price.
    retail = Field()
    # Currency of both prices.
    currency = Field()
私のスパイダーコード:
def parse(self, response):
    """Start a session crawl for each of the first two featured articles.

    For every matching ``article.feature`` element a ``Session`` loader is
    created, filled with the section title and the gender taken from the
    request URL, and handed to ``parse_session`` through the request meta
    together with a page counter starting at 1.

    Args:
        response: The landing-page response.

    Yields:
        One follow-up request per featured article that exposes a link.
    """
    for session in response.css("article.feature:nth-of-type(-n+2)"):
        sessionlink = session.css("a.feature__link::attr(href)").extract_first()
        if sessionlink is None:
            # response.follow() raises on a missing URL, which would abort
            # this generator and drop every remaining session as well.
            continue

        lsession = ItemLoader(item=Session(), response=response)
        lsession.add_value(
            "name", session.css("div.feature__title h3::text").extract_first()
        )
        # Splitting on runs of "/" turns "scheme://host/segment/..." into
        # ["scheme:", "host", "segment", ...]; index 2 is the first path
        # segment, which is used as the gender.
        lsession.add_value("gender", re.split("[/]+", response.request.url)[2])

        yield response.follow(
            sessionlink,
            callback=self.parse_session,
            meta={"lsession": lsession, "pages": 1},
        )
def parse_session(self, response):
    """Queue the first two products of a listing page and, at most, one more page.

    The session loader and the current page number are read from
    ``response.meta`` and forwarded to every request created here.

    Args:
        response: A listing-page response whose meta carries ``lsession``
            (the session loader) and ``pages`` (1-based page counter).

    Yields:
        A request to ``parse_product`` for each product that has a link, then
        a request for the next listing page while fewer than two pages have
        been visited and a next-page link exists.
    """
    lsession = response.meta["lsession"]
    pages = response.meta["pages"]

    for product in response.css("li.product-container:nth-of-type(-n+2)"):
        productlink = product.css("a.product-link::attr(href)").extract_first()
        if productlink is None:
            # No link to follow for this entry; response.follow() would raise.
            continue
        yield response.follow(
            productlink,
            callback=self.parse_product,
            meta={"lsession": lsession, "productlink": productlink},
        )

    nextpage = response.css("ul.pager li.next a::attr(href)").extract_first()
    # The last listing page has no "next" link; following None would raise
    # even though all product requests were already produced.
    if nextpage is not None and pages < 2:
        yield response.follow(
            nextpage,
            callback=self.parse_session,
            meta={"lsession": lsession, "pages": pages + 1},
        )
def parse_product(self, response):
    """Read the product name and request the product's price data.

    A fresh ``Product`` loader receives the heading text of the product page.
    The session loader from ``response.meta`` and the new product loader are
    both forwarded to ``parse_price``.

    Args:
        response: A product-page response whose meta carries ``lsession``.

    Yields:
        A single request for the price endpoint.
    """
    lsession = response.meta["lsession"]

    lproduct = ItemLoader(item=Product(), response=response)
    name = response.css("div.product-hero>h1::text").extract_first()
    if name is not None:
        # Only store a real heading: wrapping a missing value in str() would
        # record the literal text "None" as the product name.
        lproduct.replace_value("name", name)

    pricelink = "AN AJAX LINK TO GET THE PRICE"
    yield response.follow(
        pricelink,
        callback=self.parse_price,
        meta={"lsession": lsession, "lproduct": lproduct},
    )
def parse_price(self, response):
    """Attach the price to the product, add the product to its session, emit it.

    The response body is parsed as JSON; the ``productPrice`` object of its
    first element supplies the outlet price (``current.value``), the retail
    price (``rrp.value``) and the currency. All three are stored as strings.

    Args:
        response: The price response whose meta carries ``lsession`` (the
            session loader) and ``lproduct`` (the product loader).

    Yields:
        The session item as loaded after this product has been appended.

    NOTE(review): the session loader arrives via ``response.meta`` and is
    loaded on every call, so each call yields the session with all products
    appended to that loader so far, not only a final, complete item.
    """
    meta = response.meta
    session_loader = meta["lsession"]
    product_loader = meta["lproduct"]
    price_loader = ItemLoader(item=Price(), response=response)

    product_price = json.loads(response.body)[0]["productPrice"]
    raw_values = {
        "outlet": product_price["current"]["value"],
        "retail": product_price["rrp"]["value"],
        "currency": product_price["currency"],
    }
    for field_name, raw in raw_values.items():
        price_loader.replace_value(field_name, str(raw))

    product_loader.replace_value("price", price_loader.load_item())
    # The product is stored on the session as a plain dict, not as an Item.
    product_entry = dict(product_loader.load_item())
    session_loader.add_value("products", product_entry)
    yield session_loader.load_item()