0
157.55.39.136 - - [31/Dec/2015:18:44:25 +0000] "GET /robots.txt HTTP/1.1"  200 784 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 248 1120 - 3653
157.55.39.136 - - [31/Dec/2015:18:44:25 +0000] "GET /robots.txt HTTP/1.1" 200 784 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 248 1120 - 5282
199.79.62.54 - - [31/Dec/2015:18:08:01 +0000] "GET /main/cron/run.php HTTP/1.0" 200 73 "-" "Wget/1.11.4 Red Hat modified" 133 289 - 118415
157.55.39.38 - - [31/Dec/2015:18:44:15 +0000] "GET /results/ HTTP/1.1" 200 5622 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 246 5915 - 206759
157.55.39.136 - - [31/Dec/2015:18:44:21 +0000] "GET /robots.txt HTTP/1.1" 200 784 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 267 1120 - 5492
40.77.167.52 - - [31/Dec/2015:18:47:58 +0000] "GET /robots.txt HTTP/1.1" 200 784 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 267 1120 - 5860
40.77.167.52 - - [31/Dec/2015:18:48:04 +0000] "GET /results/jquery.js HTTP/1.1" 200 72174 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 255 72526 - 5675
157.55.39.38 - - [31/Dec/2015:18:43:52 +0000] "GET /robots.txt HTTP/1.1" 200 784 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 267 1120 - 5203
123.125.71.70 - - [31/Dec/2015:18:49:16 +0000] "GET /robots.txt HTTP/1.1" 200 784 "-" "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2" 217 1084 - 6847
66.249.65.71 - - [31/Dec/2015:19:11:33 +0000] "GET / HTTP/1.1" 200 24425 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 289 25071 - 336021
105.224.92.45 - - [31/Dec/2015:19:06:27 +0000] "GET /results/css/images/logo.jpg HTTP/1.1" 200 3848 "http://ieeephd.com/results/" "Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko" 350 4185 - 8041

次のJavaコードを使用してデータを前処理しています

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class mp {



    public static void main(String[] args) throws IOException{

        BufferedReader br =new BufferedReader(new FileReader ("data/kk.data"));
        BufferedWriter bw  =new BufferedWriter(new FileWriter("data/samlog.data"));

        String line;
        while((line = br.readLine())!= null) {
            String[] values =line.split(" ",-1);
            values[3]=values[3].replaceAll("[\\[\\](){}]","");
            //values[3] = values[3].replace(/:/g,'');
            String k=values[10]+values[11]+values[12]+values[13]+values[14];
            String n=k.substring(k.lastIndexOf(":")+1);
            bw.write(values[0]+","+values[3]+","+values[5]+"\","+n);
            bw.newLine();
        }


        br.close();
        bw.close();
 }
}

次のように出力を取得しています

157.55.39.136,31/Dec/2015:18:44:25,"GET",//www.bing.com/bingbot.htm)"
157.55.39.136,31/Dec/2015:18:44:25,"GET",//www.bing.com/bingbot.htm)"
199.79.62.54,31/Dec/2015:18:08:01,"GET","-""Wget/1.11.4RedHatmodified"
157.55.39.38,31/Dec/2015:18:44:15,"GET",//www.bing.com/bingbot.htm)"
157.55.39.136,31/Dec/2015:18:44:21,"GET",//www.bing.com/bingbot.htm)"
40.77.167.52,31/Dec/2015:18:47:58,"GET",//www.bing.com/bingbot.htm)"
40.77.167.52,31/Dec/2015:18:48:04,"GET",//www.bing.com/bingbot.htm)"
157.55.39.38,31/Dec/2015:18:43:52,"GET",//www.bing.com/bingbot.htm)"
123.125.71.70,31/Dec/2015:18:49:16,"GET","-""Mozilla/5.0(WindowsNT5.1;
66.249.65.71,31/Dec/2015:19:11:33,"GET",//www.google.com/bot.html)"
105.224.92.45,31/Dec/2015:19:06:27,"GET",//ieeephd.com/results/""Mozilla/5.0(WindowsNT10.0;

しかし、次のような出力が必要です

157.55.39.136,31/Dec/2015:18:44:25,"GET",www.bing.com/bingbot.htm
157.55.39.136,31/Dec/2015:18:44:25,"GET",www.bing.com/bingbot.html
157.55.39.38,31/Dec/2015:18:44:15,"GET",www.bing.com/bingbot.htm
157.55.39.136,31/Dec/2015:18:44:21,"GET",www.bing.com/bingbot.htm
40.77.167.52,31/Dec/2015:18:47:58,"GET",www.bing.com/bingbot.htm
40.77.167.52,31/Dec/2015:18:48:04,"GET",www.bing.com/bingbot.htm
157.55.39.38,31/Dec/2015:18:43:52,"GET",www.bing.com/bingbot.htm
66.249.65.71,31/Dec/2015:19:11:33,"GET",www.google.com/bot.html
105.224.92.45,31/Dec/2015:19:06:27,"GET",//ieeephd.com/results/

このデータを前処理したり、適切な正規表現を提案したりできますか

4

2 に答える 2

0

" "空白を含む列がある場合は、分割しないでください。次に、ユーザー エージェント列を酸性に分割するためです。

代わりに、引用符で囲まれた文字列にはLR(1) パーサーを使用することをお勧めします。正規表現でこれを行うこともできますが、ハックになります。

大まかに、3 つの代替一致を記述する必要があります。

  1. 引用符付き文字列
  2. 引用符で囲まれていない文字列
  3. 空の文字列

次の空白のゼロ幅先読み。

LR パーサーは、はるかにクリーンなソリューションです。適切なパーサーは、ほとんどの CSV リーダーで見つけることができます。

于 2016-03-03T21:51:34.623 に答える
0

正規表現を使用できます。
正規表現を使用したコードは次のとおりです

File file = new File("data.txt");
Pattern ip = Pattern.compile("(\\d{1,3}\\.){3}\\d{1,3}");
Pattern date = Pattern.compile("\\[.*(?= \\+\\d+\\])");
Pattern type = Pattern.compile("\"\\w+(?= )");
Pattern url = Pattern.compile("(https?:\\/\\/)([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?");
Scanner s = new Scanner(file);
while (s.hasNextLine()) {
    String text = s.nextLine();
    Matcher[] m = new Matcher[]{ip.matcher(text), date.matcher(text), type.matcher(text), url.matcher(text)};
    boolean isMatch = true;
    for (Matcher matcher : m) {
        if (!matcher.find()) {
            isMatch = false;
            break;
        }
    }
    if (isMatch) {
        System.out.println(m[0].group() + "," + m[1].group().substring(1) + "," + m[2].group() + "\"," + m[3].group().replaceAll("http://", "").replaceAll("https://", ""));
    }
}

出力は次のとおりです。

157.55.39.136,31/Dec/2015:18:44:25,"GET",www.bing.com/bingbot.htm
157.55.39.136,31/Dec/2015:18:44:25,"GET",www.bing.com/bingbot.htm
157.55.39.38,31/Dec/2015:18:44:15,"GET",www.bing.com/bingbot.htm
157.55.39.136,31/Dec/2015:18:44:21,"GET",www.bing.com/bingbot.htm
40.77.167.52,31/Dec/2015:18:47:58,"GET",www.bing.com/bingbot.htm
40.77.167.52,31/Dec/2015:18:48:04,"GET",www.bing.com/bingbot.htm
157.55.39.38,31/Dec/2015:18:43:52,"GET",www.bing.com/bingbot.htm
66.249.65.71,31/Dec/2015:19:11:33,"GET",www.google.com/bot.html
105.224.92.45,31/Dec/2015:19:06:27,"GET",ieeephd.com/results/
于 2016-03-03T23:23:31.603 に答える