コロケーションを使用する必要があるプロジェクトに取り組んでいます。それらを抽出するために次のコードを作成しました。このコードは文字列を受け取り、その文字列内でコロケーションの品詞パターンに一致する語句のリストを返します。品詞タグ付けには Stanford POS Tagger を使用しました。
コードに関する提案が必要です。膨大な量のテキストを処理するため、非常に遅いようです。コードを改善するための提案は大歓迎です。
/**
*
* A COLLOCATION is an expression consisting of two or more words that
* correspond to some conventional way of saying things.
*
* This class uses the seven part-of-speech tag patterns for collocation
* filtering that were suggested by Justeson and Katz (1995).
* These patterns are:
*
* -----------------------------------------
* |Tag | Pattern Example |
* -----------------------------------------
* |AN | linear function |
* |NN | regression coefficients |
* |AAN | Gaussian random variable |
* |ANN | cumulative distribution function |
* |NAN | mean squared error |
* |NNN | class probability function |
* |NPN | degrees of freedom |
* -----------------------------------------
* Where A=adjective, P=preposition, & N=noun.
*
* The Stanford POS Tagger is used for the extraction process.
* see: http://nlp.stanford.edu/software/tagger.shtml#Download
*
* more on collocation: http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
* more on POS: http://acl.ldc.upenn.edu/J/J93/J93-2004.pdf
*
*/
public class GetCollocations {

    /** Path of the tagger model handed to the {@code MaxentTagger} constructor. */
    private static final String TAGGER_MODEL = "taggers/wsj-0-18-left3words.tagger";

    /** Character that separates the word from its tag in a tagged token, e.g. {@code function_NN}. */
    private static final char TAG_SEPARATOR = '_';

    /**
     * Shared tagger instance, created on first use. Constructing a tagger per
     * call reloaded the model file every time, which is wasted work when many
     * texts are processed.
     */
    private static MaxentTagger tagger;

    /**
     * Extracts every two- and three-word sequence of {@code text} whose
     * part-of-speech tags match one of the patterns AN, NN, AAN, ANN, NAN,
     * NNN or NPN (A = adjective, N = noun, P = preposition).
     *
     * <p>A matching two-word pattern is added before the three-word pattern
     * that starts at the same position, and results are ordered by the
     * position of their first word. Duplicates are kept.
     *
     * @param text the text to analyse; {@code null} or empty yields an empty list
     * @return the collocations found, as space-separated words without tags
     * @throws IOException            if the tagger model cannot be read
     * @throws ClassNotFoundException if the tagger model cannot be deserialized
     */
    public static ArrayList<String> GetCollocations(String text) throws IOException, ClassNotFoundException {
        ArrayList<String> collocations = new ArrayList<String>();
        if (text == null || text.isEmpty()) {
            return collocations;
        }

        // NOTE(review): the shared tagger is used without locking here; if callers are
        // multi-threaded, confirm tagString is thread-safe for the tagger version in use.
        String[] tagged = getTagger().tagString(text).split("\\s+");

        // Extract each tag once instead of re-parsing the same token at every offset.
        String[] tags = new String[tagged.length];
        for (int i = 0; i < tagged.length; i++) {
            tags[i] = tagOf(tagged[i]);
        }

        // Every pattern needs at least two tokens, so the last token can never start one.
        for (int i = 0; i + 1 < tagged.length; i++) {
            boolean startsWithNoun = isNoun(tags[i]);
            if (!startsWithNoun && !isAdjective(tags[i])) {
                continue;
            }

            String secondTag = tags[i + 1];
            // Guard the look-ahead: a pattern starting on the second-to-last token has no third word.
            boolean thirdIsNoun = i + 2 < tagged.length && isNoun(tags[i + 2]);

            if (isNoun(secondTag)) {
                // NN or AN, optionally extended to NNN or ANN.
                String pair = GetWordWithoutTag(tagged[i]) + " " + GetWordWithoutTag(tagged[i + 1]);
                collocations.add(pair);
                if (thirdIsNoun) {
                    collocations.add(pair + " " + GetWordWithoutTag(tagged[i + 2]));
                }
            } else if (thirdIsNoun
                    && (isAdjective(secondTag) || (startsWithNoun && isPreposition(secondTag)))) {
                // NAN, AAN or NPN. A preposition in the middle is only valid after a noun.
                collocations.add(GetWordWithoutTag(tagged[i]) + " "
                        + GetWordWithoutTag(tagged[i + 1]) + " "
                        + GetWordWithoutTag(tagged[i + 2]));
            }
        }
        return collocations;
    }

    /**
     * Strips the tag from a tagged token.
     *
     * <p>The last separator is used so that a word which itself contains
     * {@code _} is kept intact. A token without any separator is returned
     * unchanged.
     *
     * @param wordWithTag a token of the form {@code word_TAG}
     * @return the word part of the token
     */
    public static String GetWordWithoutTag(String wordWithTag) {
        int separator = wordWithTag.lastIndexOf(TAG_SEPARATOR);
        if (separator < 0) {
            return wordWithTag;
        }
        return wordWithTag.substring(0, separator);
    }

    /** Returns the shared tagger, loading the model on the first call only. */
    private static synchronized MaxentTagger getTagger() throws IOException, ClassNotFoundException {
        if (tagger == null) {
            tagger = new MaxentTagger(TAGGER_MODEL);
        }
        return tagger;
    }

    /** Returns the tag part of a tagged token, or an empty string if the token carries no tag. */
    private static String tagOf(String wordWithTag) {
        int separator = wordWithTag.lastIndexOf(TAG_SEPARATOR);
        if (separator < 0) {
            return "";
        }
        return wordWithTag.substring(separator + 1);
    }

    /** Tells whether the tag is one of the noun tags NN, NNS, NNP or NNPS. */
    private static boolean isNoun(String tag) {
        return tag.equals("NN") || tag.equals("NNS") || tag.equals("NNP") || tag.equals("NNPS");
    }

    /** Tells whether the tag is one of the adjective tags JJ, JJR or JJS. */
    private static boolean isAdjective(String tag) {
        return tag.equals("JJ") || tag.equals("JJR") || tag.equals("JJS");
    }

    /** Tells whether the tag is the preposition tag IN. */
    private static boolean isPreposition(String tag) {
        return tag.equals("IN");
    }
}