フォルダー内の XML ファイルを Solr に一括インポートしようとしています。私の DIH (DataImportHandler) 構成は次のとおりです。
<dataConfig>
  <dataSource type="FileDataSource"/>
  <document>
    <!-- Outer entity: enumerates the files under baseDir whose names match
         the fileName pattern, including files in subdirectories
         (recursive="true"). -->
    <entity name="xmlImport"
            processor="FileListEntityProcessor"
            baseDir="/home/rsp/shellscript/output"
            fileName=".*xml"
            recursive="true"
            rootEntity="false"
            dataSource="null">
      <!-- Inner entity: reads each file found above via its absolute path
           and extracts field values with XPath, one record per /root
           element. -->
      <entity name="nested"
              processor="XPathEntityProcessor"
              transformer="DateFormatTransformer,TemplateTransformer"
              url="${xmlImport.fileAbsolutePath}"
              forEach="/root">
        <field column="id" xpath="/root/sub1/sub2/id"/>
      </entity>
    </entity>
  </document>
</dataConfig>
フル インポートを実行しようとすると、ジョブが失敗し、以下のログが残ります。
Full Import failed:java.lang.RuntimeException: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: Parsing failed for xml, url:/home/rsp/shellscript/output/file1.xml rows processed:0 Processing Document # 1
at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:271)
at org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:417)
at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:481)
at org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:462)
Caused by: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: Parsing failed for xml, url:/home/rsp/shellscript/output/file1.xml rows processed:0 Processing Document # 1
at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:417)
at org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:330)
at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:233)
... 3 more
Caused by: org.apache.solr.handler.dataimport.DataImportHandlerException: Parsing failed for xml, url:/home/rsp/shellscript/output/file1.xml rows processed:0 Processing Document # 1
at org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow(DataImportHandlerException.java:70)
at org.apache.solr.handler.dataimport.XPathEntityProcessor.initQuery(XPathEntityProcessor.java:330)
at org.apache.solr.handler.dataimport.XPathEntityProcessor.fetchNextRow(XPathEntityProcessor.java:225)
at org.apache.solr.handler.dataimport.XPathEntityProcessor.nextRow(XPathEntityProcessor.java:205)
at org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:244)
at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:476)
at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:515)
at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:415)
... 5 more
Caused by: java.lang.RuntimeException: com.ctc.wstx.exc.WstxParsingException: Undeclared general entity "ldquo"
at [row,col {unknown-source}]: [147,57]
at org.apache.solr.handler.dataimport.XPathRecordReader.streamRecords(XPathRecordReader.java:188)
at org.apache.solr.handler.dataimport.XPathEntityProcessor.initQuery(XPathEntityProcessor.java:319)
... 11 more
Caused by: com.ctc.wstx.exc.WstxParsingException: Undeclared general entity "ldquo"
at [row,col {unknown-source}]: [147,57]
at com.ctc.wstx.sr.StreamScanner.constructWfcException(StreamScanner.java:614)
at com.ctc.wstx.sr.StreamScanner.throwParseError(StreamScanner.java:487)
at com.ctc.wstx.sr.BasicStreamReader.handleUndeclaredEntity(BasicStreamReader.java:5470)
at com.ctc.wstx.sr.StreamScanner.expandUnresolvedEntity(StreamScanner.java:1742)
at com.ctc.wstx.sr.StreamScanner.expandEntity(StreamScanner.java:1626)
at com.ctc.wstx.sr.StreamScanner.fullyResolveEntity(StreamScanner.java:1564)
at com.ctc.wstx.sr.BasicStreamReader.skipTokenText(BasicStreamReader.java:3604)
at com.ctc.wstx.sr.BasicStreamReader.skipToken(BasicStreamReader.java:3369)
at com.ctc.wstx.sr.BasicStreamReader.nextFromTree(BasicStreamReader.java:2629)
at com.ctc.wstx.sr.BasicStreamReader.next(BasicStreamReader.java:1073)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:377)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:347)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:347)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:347)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
at org.apache.solr.handler.dataimport.XPathRecordReader$Node.access$200(XPathRecordReader.java:203)
at org.apache.solr.handler.dataimport.XPathRecordReader.streamRecords(XPathRecordReader.java:185)
... 12 more
エラーの原因は「Undeclared general entity "ldquo"」、つまり XML 内で一般エンティティ「ldquo」が宣言されていないことです。
XML 内には &ldquo; や &reg; のような XHTML エンティティが含まれていますが、これらはインデックス作成には必要ありません。これらを無視して、XML からフィールド値を取得するにはどうすればよいですか? Solr 5.4.1 インスタンスと Java バージョン「1.7.0_45」を使用しています。