-2

コードを実行しようとすると、この問題に直面します。このスクレイピングのリアルタイム リクエストを定義しましたが、まだ機能しません。Pythonでこの問題に対処する方法を知っている人はいますか? この場合、サイトマップはどのくらい重要ですか? 前もって感謝します

import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
                                process_singular_item,
                                process_date_item,
                                process_array_item,
                                process_plural_texts,
                                process_external_links,
                                process_article_text)

HEADER_XPATH = ['//h1[@class="article-title"]//text()']
AUTHOR_XPATH = ['//span[@class="cnnbyline"]//text()',
            '//span[@class="byline"]//text()']
PUBDATE_XPATH = ['//span[@class="cnnDateStamp"]//text()']
TAGS_XPATH = ['']
CATEGORY_XPATH = ['']
TEXT = ['//div[@id="storytext"]//text()',
    '//div[@id="storycontent"]//p//text()']
INTERLINKS = ['//span[@class="inStoryHeading"]//a/@href']
DATE_FORMAT_STRING = '%Y-%m-%d'


class CNNnewsSpider(ModSitemapSpider):

    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]


def parse(self, response):
    items = []
    item = ContentItems()
    item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
    item['resource'] = urlparse(response.url).hostname
    item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
    item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
    item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
    item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
    item['article_text'] = process_article_text(self, response, TEXT)
    item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
    item['link'] = response.url
    items.append(item)
    return items

これは私のテキストの結果です:

File "/home/nik/project/lib/python3.5/site-      packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET     http://edition.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50-     years-joseph/index.html> from <GET http://www.cnn.com/2016/10/15/opinions/the-     black-panthers-heirs-after-50-years-joseph/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET   http://edition.cnn.com/2016/10/15/africa/montreal-climate-change-hfc-  kigali/index.html> from <GET http://www.cnn.com/2016/10/15/africa/montreal-  climate-change-hfc-kigali/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html> from <GET http://www.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html>
2016-10-17 18:48:04 [scrapy] ERROR: Spider error processing <GET    http://edition.cnn.com/2016/10/15/politics/donald-trump-hillary-clinton-drug-    test/index.html> (referer: http://edition.cnn.com/sitemaps/sitemap-news.xml)
Traceback (most recent call last):
File "/home/nik/project/lib/python3.5/site-   packages/twisted/internet/defer.py", line 587, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/nik/project/lib/python3.5/site-   packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
4

1 に答える 1