solr - SOLR を使用した NCrawler

Question

NCrawler を使用して Web サイトをクロールすることができました。そのデータを SOLR にインポートして、SOLR でインデックス付きデータを検索に使用できるようにすることはできますか?

可能であれば、クロールされたデータを SOLR にプッシュするにはどうすればよいですか? どんな助けでも本当に感謝しています。

前もって感謝します。

score 4 · Accepted Answer

はい、クロールされたデータを Solr にインデックス化することは可能です。私は以前にこれをしました。IPipelineStep を実装するカスタムパイプラインステップを作成し、それを NCrawler 実装に追加する必要があります。Solr に接続するためのクライアントとしてSolrNetを使用しました。

以下に、作業を開始するのに役立つコードをいくつか示します。

 // Register the IndexItem mapping with SolrNet once, before the crawl starts, so that the
 // pipeline step can resolve ISolrOperations<IndexItem> from the service locator.
 SolrNet.Startup.Init<IndexItem>("http://localhost:8983/solr");

 // Pipeline steps are passed in order: HTML processing first, then the custom Solr indexing step.
 using (Crawler c = new Crawler("http://ncrawler.codeplex.com/",
     new HtmlDocumentProcessor(), new AddCrawledItemToSolrIndex()))
 {
     c.ThreadCount = 3;
     c.MaxCrawlDepth = 2;

     // Exclude URLs that look like static assets (images, stylesheets, scripts, icons).
     // The statement must end with ';' (the original snippet ended it with ',' and did not compile).
     c.ExcludeFilter = new[] { new RegexFilter(
         new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
             RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)) };
     c.Crawl();
 }

カスタム IPipelineStep

using System;
using System.Collections.ObjectModel;
using System.Security.Cryptography;
using System.Text;
using Microsoft.Practices.ServiceLocation;
using MyCrawler.Index;
using NCrawler;
using NCrawler.Interfaces;
using SolrNet;

namespace MyCrawler.Crawler
{
    /// <summary>
    /// NCrawler pipeline step that converts a crawled page into an <see cref="IndexItem"/>
    /// and adds it to Solr through SolrNet.
    /// </summary>
    public class AddCrawledItemToSolrIndex : IPipelineStep
    {
        // The Unix epoch as an explicit UTC instant, so timestamp arithmetic never depends on
        // the machine's time zone or daylight-saving rules.
        private static readonly DateTime UnixEpochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

        /// <summary>
        /// Builds an <see cref="IndexItem"/> from the crawled page and adds it to the Solr index.
        /// Pages without any extracted text are skipped.
        /// </summary>
        /// <param name="crawler">The crawler that produced the page (not used by this step).</param>
        /// <param name="propertyBag">The crawl result: URI, text, title, headers and content type.</param>
        public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
        {
            if (string.IsNullOrWhiteSpace(propertyBag.Text))
            {
                return;
            }

            var url = propertyBag.Step.Uri.ToString();

            var indexItem = new IndexItem
            {
                // The page URL doubles as the document id.
                Id = url,
                Url = url,
                Host = propertyBag.Step.Uri.Host,
                Content = propertyBag.Text,
                Title = propertyBag.Title,
                LastModified = Convert.ToInt64(DateTimeToUnixTimestamp(propertyBag.LastModified)),
                // Machine-readable date: format with the invariant culture so the digits and
                // calendar do not change with the current thread culture.
                Date = propertyBag.LastModified.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture),
                Keywords = ExtractKeywords(propertyBag.Headers),
                Type = SplitString(propertyBag.ContentType, ';'),
                Digest = CreateMD5Hash(propertyBag.Text),
            };

            var solr = ServiceLocator.Current.GetInstance<ISolrOperations<IndexItem>>();

            // CommitWithin lets Solr batch commits instead of committing once per document.
            solr.Add(indexItem, new AddParameters { CommitWithin = 10000 });
        }

        /// <summary>
        /// Splits <paramref name="input"/> on <paramref name="splitOn"/> and trims each part.
        /// </summary>
        /// <param name="input">The delimited string; may be null or empty.</param>
        /// <param name="splitOn">The delimiter character.</param>
        /// <returns>
        /// The trimmed, non-blank parts; an empty collection when <paramref name="input"/> is
        /// null or contains no non-blank part.
        /// </returns>
        private static Collection<string> SplitString(string input, char splitOn)
        {
            var valueCollection = new Collection<string>();

            // A missing value (e.g. no Content-Type) yields no entries rather than an exception.
            if (string.IsNullOrWhiteSpace(input))
            {
                return valueCollection;
            }

            foreach (var value in input.Split(splitOn))
            {
                var trimmed = value.Trim();

                // Skip blank parts produced by leading, trailing or doubled delimiters.
                if (trimmed.Length > 0)
                {
                    valueCollection.Add(trimmed);
                }
            }

            return valueCollection;
        }

        /// <summary>
        /// Converts <paramref name="dateTime"/> to seconds elapsed since 1970-01-01T00:00:00Z.
        /// </summary>
        /// <param name="dateTime">
        /// The instant to convert. It is normalised with <see cref="DateTime.ToUniversalTime"/>,
        /// which treats values of unspecified kind as local time.
        /// </param>
        /// <returns>Seconds since the Unix epoch (may be negative for earlier dates).</returns>
        private static double DateTimeToUnixTimestamp(DateTime dateTime)
        {
            // Compare UTC with UTC. Subtracting a local-time epoch, as before, is wrong whenever
            // the input is UTC or its daylight-saving offset differs from the epoch's offset.
            return (dateTime.ToUniversalTime() - UnixEpochUtc).TotalSeconds;
        }

        /// <summary>
        /// Computes the MD5 hash of <paramref name="input"/> as a lower-case hexadecimal string.
        /// </summary>
        /// <param name="input">The text to hash; null is hashed as the empty string.</param>
        /// <returns>A 32-character lower-case hex digest.</returns>
        private static string CreateMD5Hash(string input)
        {
            // UTF-8 keeps every character distinct; ASCII would replace each non-ASCII
            // character with '?', so different pages could produce the same digest.
            var inputBytes = Encoding.UTF8.GetBytes(input ?? string.Empty);

            byte[] hashBytes;
            using (var md5 = MD5.Create())
            {
                hashBytes = md5.ComputeHash(inputBytes);
            }

            // Two lower-case hex digits per hash byte.
            var sb = new StringBuilder(hashBytes.Length * 2);
            foreach (var hashByte in hashBytes)
            {
                sb.Append(hashByte.ToString("x2"));
            }

            return sb.ToString();
        }

        /// <summary>
        /// Reads the "keywords" entry from <paramref name="headers"/> and splits it on commas.
        /// </summary>
        /// <param name="headers">The header collection of the crawled response; may be null.</param>
        /// <returns>The trimmed keywords, or an empty collection when none are present.</returns>
        private static Collection<string> ExtractKeywords(System.Net.WebHeaderCollection headers)
        {
            var keywords = headers == null ? null : headers["keywords"];
            if (string.IsNullOrWhiteSpace(keywords))
            {
                return new Collection<string>();
            }

            return SplitString(keywords, ',');
        }
    }
}

これは、Solr インデックスフィールドへのマッピングに次の IndexItem.cs クラスを使用しています。

using System.Collections.ObjectModel;
using SolrNet.Attributes;

namespace MyCrawler.Index
{
    /// <summary>
    /// Document type sent to Solr. Each property is bound to the Solr field named in its
    /// <c>SolrField</c> attribute.
    /// </summary>
    public class IndexItem
    {
        /// <summary>Bound to the Solr field "id".</summary>
        [SolrField("id")]
        public string Id { get; set; }
        /// <summary>Bound to the Solr field "url".</summary>
        [SolrField("url")]
        public string Url { get; set; }
        /// <summary>Bound to the Solr field "host".</summary>
        [SolrField("host")]
        public string Host { get; set; }
        /// <summary>Bound to the Solr field "content".</summary>
        [SolrField("content")]
        public string Content { get; set; }
        /// <summary>Bound to the Solr field "title".</summary>
        [SolrField("title")]
        public string Title { get; set; }
        /// <summary>Bound to the Solr field "description".</summary>
        [SolrField("description")]
        public string Description { get; set; }
        /// <summary>Bound to the Solr field "digest".</summary>
        [SolrField("digest")]
        public string Digest { get; set; }
        /// <summary>Bound to the Solr field "keywords"; holds multiple values.</summary>
        [SolrField("keywords")]
        public Collection<string> Keywords { get; set; }
        /// <summary>Bound to the Solr field "date"; carried as a string.</summary>
        [SolrField("date")]
        public string Date { get; set; }
        /// <summary>Bound to the Solr field "contentLength".</summary>
        [SolrField("contentLength")]
        public long ContentLength { get; set; }
        /// <summary>Bound to the Solr field "lastModified"; carried as a 64-bit integer.</summary>
        [SolrField("lastModified")]
        public long LastModified { get; set; }
        /// <summary>Bound to the Solr field "type"; holds multiple values.</summary>
        [SolrField("type")]
        public Collection<string> Type { get; set; }
    }
}

Nutch コードベースから取得した Solr フィールド定義 (schema.xml)。

  <!-- NOTE(review): IndexItem also maps a field named "id", which is not declared in this
  excerpt; confirm it is defined elsewhere in the full schema. -->
  <!-- core fields -->
  <field name="segment" type="string" stored="true" indexed="false"/>
  <field name="digest" type="string" stored="true" indexed="false"/>
  <field name="boost" type="float" stored="true" indexed="false"/>

  <!-- meta-tag fields -->
  <field name="keywords" type="text_general" stored="true" indexed="true" multiValued="true"/>
  <field name="description" type="text_general" stored="true" indexed="true"/>

  <!-- fields for index-basic plugin -->
  <field name="host" type="url" stored="false" indexed="true"/>
  <field name="site" type="string" stored="true" indexed="true"/>
  <field name="url" type="url" stored="true" indexed="true"
      required="true"/>
  <field name="content" type="text_general" stored="true" indexed="true"/>
  <field name="title" type="text_general" stored="true" indexed="true"/>
  <field name="cache" type="string" stored="true" indexed="false"/>
  <field name="tstamp" type="long" stored="true" indexed="true"/>

  <!-- fields for index-anchor plugin -->
  <field name="anchor" type="string" stored="true" indexed="true"
      multiValued="true"/>

  <!-- fields for index-more plugin -->
  <field name="type" type="string" stored="true" indexed="true"
      multiValued="true"/>
  <field name="contentLength" type="long" stored="true"
      indexed="false"/>
  <field name="lastModified" type="long" stored="true"
      indexed="true"/>
  <field name="date" type="string" stored="true" indexed="true"/>

  <!-- fields for languageidentifier plugin -->
  <field name="lang" type="string" stored="true" indexed="true"/>

  <!-- fields for subcollection plugin -->
  <field name="subcollection" type="string" stored="true"
      indexed="true"/>

  <!-- fields for feed plugin -->
  <field name="author" type="string" stored="true" indexed="true"/>
  <field name="tag" type="string" stored="true" indexed="true"/>
  <field name="feed" type="string" stored="true" indexed="true"/>
  <field name="publishedDate" type="string" stored="true"
      indexed="true"/>
  <field name="updatedDate" type="string" stored="true"
      indexed="true"/>

  <!-- catchall field, containing all other searchable text fields (implemented
  via copyField further on in this schema) -->
  <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

  <field name="_version_" type="long" indexed="true" stored="true"/>

  <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
</fields>

当然ながら、実際にはご自身のニーズに合わせてこれらを変更することになるでしょう。そうすることで、パフォーマンスも改善できる可能性があります。それでも、出発点としては参考になるはずです。

solr - SOLR を使用した NCrawler

1 に答える 1

Related

Reference