Highlighter は Explainer よりも優れており、高速です。Highlighter で一致箇所を強調表示した後、強調タグ(例: <B> と </B>)の間にある一致フレーズを抽出できます。
参考: タグ間のテキストを抽出する Java 正規表現
/**
 * JUnit demo of the Lucene highlighter: indexes a few sample sentences into an
 * in-memory directory, searches them for a term, and checks that every scored
 * fragment returned by the highlighter has the match wrapped in {@code <B>} tags.
 */
public class HighlightDemo {
    /** In-memory index; recreated for every test by {@link #setUp()}. */
    Directory directory;

    /** Analyzer shared by indexing, query parsing and highlighting. */
    Analyzer analyzer;

    /**
     * Sample texts. The position of each entry in this array is stored in the
     * index as the document's "id" field, so the original text can be looked up
     * again at highlighting time.
     */
    String[] contents = {"running in the park",
            "I was jogging in the park this morning",
            "running on the road",
            "The famous New York Marathon has its final miles in Central park every year and it's easy to understand why: the park, with a variety of terrain and excellent scenery, is the ultimate runner's dream. With its many paths that range in level of difficulty, Central Park allows a runner to experience clarity and freedom in this picturesque urban oasis."};

    /**
     * Builds the index: one document per entry of {@link #contents}.
     *
     * @throws IOException if the index cannot be written
     */
    @Before
    public void setUp() throws IOException {
        directory = new RAMDirectory();
        analyzer = new WhitespaceAnalyzer();
        IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            for (int i = 0; i < contents.length; i++) {
                Document doc = new Document();
                // Indexed but NOT stored: the text itself is recovered from `contents` via the id below.
                doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                // Stored and indexed: the array position of the text.
                doc.add(new NumericField("id", Field.Store.YES, true).setIntValue(i));
                writer.addDocument(doc);
            }
        } finally {
            // Close the writer even if adding a document fails.
            writer.close();
        }
    }

    /**
     * Searches the "content" field for {@code park} and verifies that each
     * fragment with a positive score contains both the opening and closing
     * {@code <B>} highlight tags. Each such fragment is also printed to stdout.
     *
     * @throws IOException                  if the index cannot be read
     * @throws ParseException               if the query string cannot be parsed
     * @throws InvalidTokenOffsetsException if the highlighter rejects the token offsets
     */
    @Test
    public void test() throws IOException, ParseException, InvalidTokenOffsetsException {
        IndexSearcher s = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_36, "content", analyzer);
            org.apache.lucene.search.Query query = parser.parse("park");
            TopDocs hits = s.search(query, 10);
            // Guard against a vacuous pass: with no hits the loop below would assert nothing.
            assertTrue("expected at least one hit for 'park'", hits.scoreDocs.length > 0);

            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int id = hits.scoreDocs[i].doc;
                // Load the stored document once; only its "id" field is needed.
                Document doc = s.doc(id);
                // The content field is not stored, so map the stored id back to the source text.
                String text = contents[Integer.parseInt(doc.get("id"))];
                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
                org.apache.lucene.search.highlight.TextFragment[] frag =
                        highlighter.getBestTextFragments(tokenStream, text, false, 10);
                for (int j = 0; j < frag.length; j++) {
                    if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                        String highlighted = frag[j].toString();
                        assertTrue(highlighted.contains("<B>"));
                        assertTrue(highlighted.contains("</B>"));
                        System.out.println(highlighted);
                    }
                }
            }
        } finally {
            // Release the searcher (and the index reader it opened) even when an assertion fails.
            s.close();
        }
    }
}