エラーが発生しましたが、誰かが次のコードで私を助けることができます: 前処理を実行することになっています
// program to perform preprocess
public static void main(String[] args) {
//public class PreProcess {
// Read a file into a string. Takes file path, returns string
/**
*
* @param path
* @return
*/
public String readFileIntoString(String path) {
char[] line = new char[1024];
StringBuilder dataString;
dataString = new StringBuilder(5000);
try {
try (BufferedReader input = new BufferedReader(new FileReader(path))) {
while (true) {
int readLength = input.read(line);
if (readLength == -1)
break;
dataString.append(line, 0, readLength);
}
}
return dataString.toString();
}
catch (IOException e) {
return " ";
}
}
// Removes stop words from a string. Takes stop word file path and returns
// string
public static String removeStopWords(String fileData, String stopWordFilePath) {
String newfile = fileData;
String line;
try {
BufferedReader input = new BufferedReader(new FileReader(stopWordFilePath));
while ((line = input.readLine()) != null) {
if (line.compareTo("") == 0)
continue;
line = " " + line + " ";
newfile = newfile.replaceAll(line, " ");
}
input.close();
}
catch (IOException e) {
e.printStackTrace();
}
return newfile;
}
public static String removeHTMLTags(String fileData) {
return strip(fileData);
}
// Filtering to a given windowsize for query terms. Takes query and size,
// returns string
public static String filterToWindow(String query, String fileData, int windowSize) {
StringBuffer dataString = new StringBuffer(5000);
String[] fileWords = fileData.split(" ");
String[] queryWords = query.split(" ");
int[] markWords = new int[fileWords.length];
for (int i = 0; i < fileWords.length; i++) {
markWords[i] = 0;
}
for (int i = 0; i < fileWords.length; i++) {
for (int j = 0; j < queryWords.length; j++) {
if (fileWords[i].compareTo(queryWords[j]) == 0) {
for (int k = 0; k < windowSize; k++) {
if (i + k < fileWords.length)
markWords[i + k] = 1;
if (i - k > 0)
markWords[i - k] = 1;
}
}
}
}
for (int i = 0; i < fileWords.length; i++) {
if (markWords[i] == 1) {
dataString.append(fileWords[i]);
dataString.append(" ");
}
}
return dataString.toString();
}
public static void extractMetaData(String fileData, String linkFilePath, int docId) {
int urlEnd = 0, urlStart = 0;
StringBuilder b3 = new StringBuilder();
StringBuilder b2 = new StringBuilder();
fileData = fileData.toLowerCase();
try {
String title = fileData.substring(fileData.indexOf("<title"), fileData.indexOf("</title>")).replaceAll("\\<.*?>", "");
writeStringIntoFile(title, linkFilePath + docId + ".title");
}
catch (Exception e) {
}
while (true) {
urlStart = fileData.indexOf("a href=\"", urlEnd) + 8;
if (urlStart == 7)
break;
urlEnd = fileData.indexOf('\"', urlStart + 1);
String link = fileData.substring(urlStart, urlEnd);
int linkstart = 0;
int linkend = link.length() - 1;
if (link.startsWith("http"))
link = link.substring(7);
while (link.startsWith("/"))
link = link.substring(1);
if (!link.startsWith("#")) {
if (link.indexOf('/') != -1)
link = link.substring(0, link.indexOf('/'));
if (!link.contains("wiki") && !link.contains("myspace.com") && !link.contains("javascript")) {
b3.append(link);
b3.append("\n");
}
}
}
writeStringIntoFile(b3.toString(), linkFilePath + docId + ".links");
urlEnd = 0;
while (true) {
urlStart = fileData.indexOf("src=\"", urlEnd) + 5;
if (urlStart == 4)
break;
urlEnd = fileData.indexOf('\"', urlStart + 1);
String link = fileData.substring(urlStart, urlEnd);
if (!link.startsWith("#")) {
if (!link.startsWith("/")) {
link = link.substring(0, link.lastIndexOf('/') + 1);
}
b2.append(link);
b2.append("\n");
}
}
writeStringIntoFile(b2.toString(), linkFilePath + docId + ".images");
}
// Saves a string to a file. Takes string and file path
public static void writeStringIntoFile(String fileData, String path) {
try {
try (BufferedWriter output = new BufferedWriter(new FileWriter(path))) {
output.write(fileData);
}
}
catch (IOException e) {
}
}
private static String strip(String inputString) {
inputString = inputString.replaceAll("\\<style.*?</style>", " ");
inputString = inputString.replaceAll("\\<script.*?</script>", " ");
inputString = inputString.replaceAll("\\<.*?>", " ").replaceAll("[^A-Za-z]+", " ").replaceAll("\\s+", " ");
inputString = inputString.trim();
// inputString = PorterStemmer.applyStemmer(inputString);
return inputString;
}
}
}