0

皆さん、こんにちは。

単刀直入にお聞きします。MySQL データベースに多数の BLOB を保存しています。その大半は PDF (約 80%) で、残りは .doc ファイルです。同じ DB にはテキストデータもあります。テキストについてはこれまでインデックスを作成して検索できていますが、BLOB にはインデックスを付けることができていません。単一のコレクション (ドキュメント) を作成しようとしているのですが、うまくいきません。これを実現するための手順や設定例はありますか?

data-config.xml の一部:

<?xml version="1.0" encoding="utf-8"?>

<dataConfig>

<!-- JDBC connection to the MySQL schema "ktimatologio" on the local host.
     Entities select this source with dataSource="db". -->
<dataSource name="db"
            type="JdbcDataSource"
            driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://127.0.0.1:3306/ktimatologio"
            user="root"
            password="********"
            batchSize="-1"
            convertType="false"
            autoCommit="true"/>

<!-- Stream source named "fieldReader"; the nested TikaEntityProcessor entity
     selects it to read a column of its parent row as a stream. -->
<dataSource type="FieldStreamDataSource" name="fieldReader"/>

  <document> 

    <!-- Textual rows (type = 'text') of table aitiologikes_ektheseis.
         pk="id" declares the key column that a delta import carries from
         deltaQuery into ${dataimporter.delta.id}. deltaQuery therefore only
         has to return the id of each changed row; the full column list is
         fetched per row by deltaImportQuery. -->
    <entity name="aitiologikes_ektheseis"
            dataSource="db"
            pk="id"
            transformer="HTMLStripTransformer"
            query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'"
            deltaQuery="select id from aitiologikes_ektheseis where type = 'text' and last_modified &gt; '${dataimporter.last_index_time}'"
            deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text' and id='${dataimporter.delta.id}'">
      <!-- The database id is indexed as "ida"; solr_id is the id joined with the model by '_'. -->
      <field column="id" name="ida"/>
      <field column="solr_id" name="solr_id"/>
      <!-- stripHTML is acted on by the HTMLStripTransformer declared on this entity. -->
      <field column="title" name="title" stripHTML="true"/>
      <field column="grid_title" name="grid_title" stripHTML="true"/>
      <field column="model" name="model" stripHTML="true"/>
      <field column="type" name="type" stripHTML="true"/>
      <field column="url" name="url" stripHTML="true"/>
      <field column="last_modified" name="last_modified" stripHTML="true"/>
      <field column="search_tag" name="search_tag" stripHTML="true"/>
      <!-- "content" is body and title concatenated by the SQL query. -->
      <field column="content" name="content" stripHTML="true"/>
    </entity>

    <!-- Binary rows (type = 'bin') of table aitiologikes_ektheseis.
         The BLOB column bin_con is exposed to the nested entity as "text".
         pk="id" declares the key column used by delta imports, and deltaQuery
         returns only that key so the BLOB is not read just to detect changes.
         HTMLStripTransformer is listed so that the stripHTML="true" flags
         below take effect, as they do in the sibling text entity;
         TemplateTransformer is kept from the original configuration. -->
    <entity name="aitiologikes_ektheseis_bin"
            dataSource="db"
            pk="id"
            transformer="HTMLStripTransformer,TemplateTransformer"
            query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin'"
            deltaQuery="select id from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'"
            deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'">
      <field column="id" name="ida"/>
      <field column="solr_id" name="solr_id"/>
      <field column="title" name="title" stripHTML="true"/>
      <field column="grid_title" name="grid_title" stripHTML="true"/>
      <field column="model" name="model" stripHTML="true"/>
      <field column="type" name="type" stripHTML="true"/>
      <field column="url" name="url" stripHTML="true"/>
      <field column="last_modified" name="last_modified" stripHTML="true"/>
      <field column="search_tag" name="search_tag" stripHTML="true"/>

      <!-- Nested entity: reads the parent row's "text" column through the
           fieldReader data source and lets Tika extract plain text
           (format="text") into the "contentbin" field. Every DIH entity needs
           a name, so one is given here. No transformer runs on this entity,
           so no stripHTML flag is set on its field. -->
      <entity name="aitiologikes_ektheseis_bin_tika"
              dataSource="fieldReader"
              processor="TikaEntityProcessor"
              dataField="aitiologikes_ektheseis_bin.text"
              format="text">
        <field column="text" name="contentbin"/>
      </entity>

    </entity>

       ...
       ...
  </document> 

</dataConfig>

schema.xml の一部 (fieldTypes とフィールド定義):

<!-- Mixed English/Greek text field.
     Index and query analyzers apply the same filters in the same order so
     that both sides produce identical terms; the query side additionally
     expands synonyms first.
     Order rationale:
       * GreekLowerCaseFilter runs before the Greek stop filter, matching the
         order of Solr's stock Greek analysis chain. NOTE(review): this assumes
         lang/stopwords_el.txt is the stock, already-normalized list; confirm
         if the file is custom.
       * KeywordMarkerFilter runs before both stemmers so that words listed in
         protwords.txt are protected from Greek as well as Porter stemming.
     Changing an analyzer chain requires a full re-index. -->
<fieldType name="text_ktimatologio" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Greek text field with HTML stripped at the character level.
     Index and query analyzers are identical. GreekLowerCaseFilter runs
     before the Greek stop filter, matching the order of Solr's stock Greek
     analysis chain. NOTE(review): this assumes lang/stopwords_el.txt is the
     stock, already-normalized list; confirm if the file is custom.
     The query analyzer previously listed LowerCaseFilterFactory twice; the
     redundant copy is removed.
     NOTE(review): HunspellStemFilter runs on tokens that GreekStemFilter has
     already stemmed; confirm that applying both stemmers is intended.
     Changing an analyzer chain requires a full re-index. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true"/>
  </analyzer>

  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true"/>
  </analyzer>
</fieldType>

<!-- Schema fields populated by the data import configuration. -->
<fields>
  <!-- Identifiers -->
  <field name="ida" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="solr_id" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- Titles, analyzed with the mixed English/Greek chain -->
  <field name="title" type="text_ktimatologio" indexed="true" stored="true"/>
  <field name="grid_title" type="text_ktimatologio" indexed="true" stored="true"/>

  <!-- Exact-match metadata -->
  <field name="model" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="type" type="string" indexed="true" stored="true"/>
  <field name="url" type="string" indexed="true" stored="true"/>
  <field name="last_modified" type="string" indexed="true" stored="true"/>
  <field name="search_tag" type="string" indexed="true" stored="true"/>

  <!-- Body text: "contentbin" receives the Tika output, "content" the SQL text -->
  <field name="contentbin" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="content" type="text_ktimatologio" indexed="true" stored="true" multiValued="true"/>
</fields>

これについて本当に助けが必要です!

よろしくお願いします。

トム

ギリシャ

4

1 件の回答

0

BLOB に「インデックスを付ける」とは、最終的にその内容を検索できるようにしたい、ということでしょうか? ご質問を正しく理解できているかどうか自信がありません。

まず、Solr で Apache Tika などを使って PDF や .doc を変換し、そのうえで Solr にインデックスを作成させるのがよいと思います。また、ユーザーが PDF や文書そのものにアクセスできるようにしたい場合は、DB に保存しておいてそこから取得するのが最適ではないでしょうか?

2012-08-23T21:36:07.757 に回答