1

約 4000 行ほどの巨大な nquad ファイルに対してブールクエリを実行すると問題が発生します。クエリは次のように実行しています:

Query query1 = new TermQuery(new Term(FIELD_CONTENTS, "Albania"));
    Query query2 = new TermQuery(new Term(FIELD_CONTENTS, "Hitchcock"));

    BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.add(query1, BooleanClause.Occur.MUST);
    booleanQuery.add(query2, BooleanClause.Occur.MUST);

このクエリは、検索対象の単語がファイルの行番号 780 より前 (<780) にある場合は正しく動作しますが、行番号 780 より後 (>780) にある単語を検索すると失敗します(ヒットしません)。

これは私の nquad ファイルのスニペットです:

<http://dbpedia.org/resource/A_Clockwork_Orange> <http://dbpedia.org/ontology/numberOfPages> "192"^^<http://www.w3.org/2001/XMLSchema#positiveInteger> <http://en.wikipedia.org/wiki/A_Clockwork_Orange?oldid=606117686#absolute-line=12> .

識別トークン用のカスタム アナライザーを作成します。

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

class TestAnalyzer1 extends Analyzer {
    public static final String[] TEST_STOP_WORDS = { "http", "https",
            "resource", "foaf/0.1", "dbpedia.org", "en.wikipedia.org",
            "xmlns.com", "purl.org", "elements/1.1",
            "www.w3.org/2001/XMLSchema", "www.w3.org/1999/02/22-rdf",
            "www.w3.org/2003/01", "oldid", "wiki" };

    @SuppressWarnings("rawtypes")
    private Set stopWords = StopFilter.makeStopSet(TEST_STOP_WORDS);

    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream ts = new StandardTokenizer(reader);
        ts = new StandardFilter(ts);
        ts = new StopFilter(ts, stopWords);
        return ts;
    }
}

これはメインクラスです:

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;

@SuppressWarnings("deprecation")
public class TestPreFinal {

    public static final String FILES_TO_INDEX_DIRECTORY = "filesToIndex_1";
    public static final String INDEX_DIRECTORY = "indexDirectory";

    public static final String FIELD_PATH = "path";
    public static final String FIELD_CONTENTS = "contents";

    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {

        long startTime = System.currentTimeMillis();

        Analyzer analyzer = new TestAnalyzer1();
        IndexWriter indexWriter = new IndexWriter(INDEX_DIRECTORY, analyzer,
                true);

        File dir = new File(FILES_TO_INDEX_DIRECTORY);
        File[] files = dir.listFiles();

        for (File file : files) {
            Reader reader = new FileReader(file);
            Document document = new Document();
            String path = file.getCanonicalPath();

            Field fieldPath = new Field(FIELD_PATH, path, Field.Store.YES,
                    Field.Index.UN_TOKENIZED);
            Field fieldContents = new Field(FIELD_CONTENTS, reader,
                    Field.TermVector.WITH_POSITIONS_OFFSETS);

            document.add(fieldPath);
            document.add(fieldContents);

            indexWriter.addDocument(document);
        }

        indexWriter.commit();
        indexWriter.close();

        Directory directory = FSDirectory.getDirectory(INDEX_DIRECTORY);
        IndexSearcher indexSearcher = new IndexSearcher(directory);
        IndexReader indexReader = IndexReader.open(directory);

        Query query1 = new TermQuery(new Term(FIELD_CONTENTS, "Albania"));
        Query query2 = new TermQuery(new Term(FIELD_CONTENTS, "Hitchcock"));

        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query1, BooleanClause.Occur.MUST);
        booleanQuery.add(query2, BooleanClause.Occur.MUST);

        Hits hits = indexSearcher.search(booleanQuery);
        @SuppressWarnings({ "unchecked" })
        Iterator<Hit> it = hits.iterator();
        TermFreqVector tfv = null;

        while (it.hasNext()) {
            Hit hit = it.next();
            Document document = hit.getDocument();
            String path = document.get(FIELD_PATH);
            System.out.println("Hit: " + path);
        }

        for (int i = 0; i < hits.length(); i++) {
            tfv = indexReader.getTermFreqVector(i, FIELD_CONTENTS);
            System.out.println(tfv);
        }

    }
}

他に何をすべきかわかりません。助けてください。前もって感謝します。

4

回答: 0 件