perl - Common Log Format（CLF）の行の分割

Question

私は、CLF形式の行（Apacheのaccess.logファイルの行）をPerlで分割するための、可能な限り最速の方法を見つけようとしています。ログは何年にもわたって蓄積され、数百万行に達しています。以下は私がこれまでにテストしたものです。最後の試みは、正規表現を使うよりもすでに高速です。

しかし、どう思いますか。これをさらに高速化する方法はあるでしょうか？

1 2 3 4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
1: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
1.2.3.4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
2: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
202 200 1.2.3.4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0
3: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
1.2.3.4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
4: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
GET / ..?,-" HTTP/1.0 13/Jun/2007:03:20:15 +0200 1.2.3.4 - - 200 202
5: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 1.2.3.4 - - 200 202
6: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
---- hit <ENTER> to start Test ----
Benchmark: timing 100000 iterations of Method 1, Method 2, Method 3,
Method 4, Method 5, Method 6...
1: 39 wallclock s(37.64usr + 0.12sys = 37.77CPU) @2647.81/s(n=100000)
2: 39 wallclock s(38.35usr + 0.19sys = 38.53CPU) @2595.18/s(n=100000)
3: 39 wallclock s(37.19usr + 0.14sys = 37.33CPU) @2678.74/s(n=100000)
4: 38 wallclock s(36.80usr + 0.08sys = 36.88CPU) @2711.57/s(n=100000)
5: 38 wallclock s(36.93usr + 0.14sys = 37.07CPU) @2697.89/s(n=100000)
6: 38 wallclock s(36.11usr + 0.16sys = 36.27CPU) @2757.10/s(n=100000)

8X ----------------

#!/usr/bin/perl -w
use strict;
use warnings;
use FileHandle;
use Date::Parse;
use Benchmark;

# Flush STDOUT after every write so the <ENTER> prompt appears immediately.
STDOUT->autoflush(1);

# Sample CLF line, plus the shared slots that every splitN() routine fills:
# @T receives the raw fields; $host, $timestamp and $request the derived values.
our $s = '1.2.3.4 - - [13/Jun/2007:03:20:15 +0200] "GET / ..?,-" HTTP/1.0" 200 202';
our @T;
our ($host, $timestamp, $request);

# The candidates are split1() .. split6(); the number doubles as report label.
my @labels = (1 .. 6);
my %splitter = (
    1 => \&split1,
    2 => \&split2,
    3 => \&split3,
    4 => \&split4,
    5 => \&split5,
    6 => \&split6,
);

# Functional check: run each candidate once and show what it extracted.
print "---- test functionality -----------------------------------\n";
for my $label (@labels) {
    $splitter{$label}->();
    print join(" ", @T), "\n$label: [$host] [$timestamp] [$request]\n";
}

# Wait for the user before the (slow) timing run starts.
print "---- hit <ENTER> to start Test ----";
<>;

# Benchmark entries are code strings ("&split1", ...) that Benchmark evals
# in package main.
my %bench;
for my $label (@labels) {
    $bench{$label} = "&split$label";
}
timethese(100000, \%bench);

exit 0;

1;

# Method 1: one anchored regex captures every field of the sample line $s.
# Fills the globals @T (12 captures: four IP octets, ident, user, timestamp
# text, method, URI, protocol, status, bytes), $host, $timestamp, $request.
sub split1 {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    @T = ($s =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/);
    #--------------------------------------------------------------------------

    # pack "C4" consumes only the first four captures (the octets);
    # unpack "N" reads them back as one big-endian 32-bit integer.
    my $packed_ip = pack("C4", @T);
    $host      = unpack("N", $packed_ip);
    $timestamp = str2time($T[6]);
    $request   = join(" ", @T[7 .. 9]);
}

# Method 2: split on single spaces, then splice the pieces of the quoted
# request and of the bracketed timestamp back into one field each.
# Fills the globals @T, $host, $timestamp and $request.
sub split2 {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    @T = split(/ /, $s);
    my $count = scalar @T;

    # Everything from index 5 up to (but excluding) the last two fields
    # belongs to the request; collapse it into a single element.
    my $joined_request = join(" ", @T[5 .. $count - 3]);
    splice(@T, 5, $count - 7, $joined_request);

    # Elements 3 and 4 are the two halves of "[date zone]".
    my $joined_stamp = join(" ", @T[3, 4]);
    splice(@T, 3, 2, $joined_stamp);

    chomp($T[6]);
    for my $idx (3, 4) {
        # Drop the first and last character (the [] resp. "" pair).
        $T[$idx] = substr($T[$idx], 1, -1);
    }
    #--------------------------------------------------------------------------

    my @octets = split(/\./, $T[0]);
    $host      = unpack("N", pack("C4", @octets));
    $timestamp = str2time($T[3]);
    $request   = $T[4];
}

# Method 3: index/rindex/substr only, pushing fields onto @T in the order
# they are cut off: bytes, status, host, ident, user, timestamp, request.
# Fills the globals @T, $host, $timestamp and $request.
sub split3 {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    my $rest = $s;
    my $cut;

    # Last field: everything after the final space.
    $cut = rindex($rest, ' ');
    push(@T, substr($rest, $cut + 1));
    $rest = substr($rest, 0, $cut);

    # Second-to-last field, same way.
    $cut = rindex($rest, ' ');
    push(@T, substr($rest, $cut + 1));
    $rest = substr($rest, 0, $cut);

    # First field; the -1 also drops the final character of what remains.
    $cut = index($rest, ' ');
    push(@T, substr($rest, 0, $cut));
    $rest = substr($rest, $cut + 1, -1);

    # Second field.
    $cut = index($rest, ' ');
    push(@T, substr($rest, 0, $cut));
    $rest = substr($rest, $cut + 1);

    # Third field; +2 skips the space and the character after it.
    $cut = index($rest, ' ');
    push(@T, substr($rest, 0, $cut));
    $rest = substr($rest, $cut + 2);

    # Text up to ']' is the timestamp; +3 skips three characters
    # (in the sample line: ']', the space and the opening quote).
    $cut = index($rest, ']');
    push(@T, substr($rest, 0, $cut));
    push(@T, substr($rest, $cut + 3));
    #--------------------------------------------------------------------------

    my @octets = split(/\./, $T[2]);
    $host      = unpack("N", pack("C4", @octets));
    $timestamp = str2time($T[5]);
    $request   = $T[6];
}

# Method 4: same index/rindex/substr walk as method 3, but each field is
# stored directly in its final @T slot instead of being pushed.
# Fills the globals @T, $host, $timestamp and $request.
sub split4 {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    my $rest = $s;
    my $cut;

    # The two trailing fields, taken from the right-hand end.
    $cut   = rindex($rest, ' ');
    $T[6]  = substr($rest, $cut + 1);
    $rest  = substr($rest, 0, $cut);

    $cut   = rindex($rest, ' ');
    $T[5]  = substr($rest, $cut + 1);
    $rest  = substr($rest, 0, $cut);

    # First field; the -1 also drops the final character of what remains.
    $cut   = index($rest, ' ');
    $T[0]  = substr($rest, 0, $cut);
    $rest  = substr($rest, $cut + 1, -1);

    # Second field.
    $cut   = index($rest, ' ');
    $T[1]  = substr($rest, 0, $cut);
    $rest  = substr($rest, $cut + 1);

    # Third field; +2 skips the space and the character after it.
    $cut   = index($rest, ' ');
    $T[2]  = substr($rest, 0, $cut);
    $rest  = substr($rest, $cut + 2);

    # Timestamp up to ']'; the remainder after three more characters
    # is the request.
    $cut   = index($rest, ']');
    $T[3]  = substr($rest, 0, $cut);
    $T[4]  = substr($rest, $cut + 3);
    #--------------------------------------------------------------------------

    my @octets = split(/\./, $T[0]);
    $host      = unpack("N", pack("C4", @octets));
    $timestamp = str2time($T[3]);
    $request   = $T[4];
}

# Method 5: cut out the quoted request and the bracketed timestamp with
# index/rindex, then split what is left of the line on single spaces.
# @T ends up as: request, timestamp, then the space-separated fields.
# Fills the globals @T, $host, $timestamp and $request.
sub split5 {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    my $line = $s;

    # Request: the text between the first and the last double quote.
    my $req_start = index($line, '"') + 1;
    my $req_end   = rindex($line, '"');
    $T[0] = substr($line, $req_start, $req_end - $req_start);

    # Head of the line, stopping three characters before the request text.
    my $head = substr($line, 0, $req_start - 3);

    # Timestamp: everything in the head after the last '['.
    my $bracket = rindex($head, '[');
    $T[1] = substr($head, $bracket + 1);
    $head = substr($head, 0, $bracket - 1);

    # Head without the timestamp, plus the text after the closing quote.
    my $remainder = $head . substr($line, $req_end + 1);
    push(@T, split(/ /, $remainder));
    #--------------------------------------------------------------------------

    $request   = $T[0];
    $timestamp = str2time($T[1]);
    my @octets = split(/\./, $T[2]);
    $host      = unpack("N", pack("C4", @octets));
}

# Method 6: like method 5, but takes the timestamp as a fixed 26-character
# slice after '[' and the request from a fixed offset of 30 after '['.
# @T ends up as: timestamp, request, then the space-separated fields.
# Fills the globals @T, $host, $timestamp and $request.
sub split6 {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    my $line       = $s;
    my $bracket    = index($line, '[');
    my $last_quote = rindex($line, '"');

    # 26 characters following '[' (length of "13/Jun/2007:03:20:15 +0200").
    $T[0] = substr($line, $bracket + 1, 26);

    # Request text: from offset 30 after '[' up to the last double quote.
    $T[1] = substr($line, $bracket + 30, $last_quote - $bracket - 30);

    # Text before " [" joined with the text after the closing quote.
    my $remainder = substr($line, 0, $bracket - 1) . substr($line, $last_quote + 1);
    push(@T, split(/ /, $remainder));
    #--------------------------------------------------------------------------

    $timestamp = str2time($T[0]);
    $request   = $T[1];
    my @octets = split(/\./, $T[2]);
    $host      = unpack("N", pack("C4", @octets));
}

8X ----------------

score 1 · Accepted Answer

amonが見つけた結果（str2timeがボトルネックである）に基づいて、最初の分割方法（正規表現）を（任意に）選び、str2timeを使う場合とTime::Pieceを使う場合を比較しました。実際、Time::Pieceのほうが高速です。遅延の原因がまだパーサーにあるのか（あるいは今度はOOモジュールの使用に移ったのか）を確認するためのプロファイルは取っていません。

#!/usr/bin/perl

use strict;
use warnings;

use FileHandle;
use Date::Parse;
use Time::Piece;
use Benchmark;

# Flush STDOUT after every write so the <ENTER> prompt appears immediately.
STDOUT->autoflush(1);

# Sample CLF line, plus the shared slots that parse() and piece() fill.
our $s = '1.2.3.4 - - [13/Jun/2007:03:20:15 +0200] "GET / ..?,-" HTTP/1.0" 200 202';
our @T;
our ($host, $timestamp, $request);

# Report label => implementation under test.
my @contenders = (
    [ '1', \&parse ],    # regex + Date::Parse::str2time
    [ '2', \&piece ],    # regex + Time::Piece->strptime
);

# Functional check: run each contender once and show what it extracted.
print "---- test functionality -----------------------------------\n";
for my $entry (@contenders) {
    my ($label, $code) = @{$entry};
    $code->();
    print join(" ", @T), "\n$label: [$host] [$timestamp] [$request]\n";
}

# Wait for the user before the timing run starts.
print "---- hit <ENTER> to start Test ----";
<>;

my %bench;
for my $entry (@contenders) {
    $bench{ $entry->[0] } = $entry->[1];
}
timethese(100000, \%bench);

exit 0;

1;

# Regex split of the sample line $s, with the timestamp converted by
# Date::Parse's str2time(). Fills the globals @T (12 captures), $host,
# $timestamp and $request.
sub parse {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    @T = ($s =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/);
    #--------------------------------------------------------------------------

    # pack "C4" consumes only the first four captures (the octets);
    # unpack "N" reads them back as one big-endian 32-bit integer.
    my $packed_ip = pack("C4", @T);
    $host      = unpack("N", $packed_ip);
    $timestamp = str2time($T[6]);
    $request   = join(" ", @T[7 .. 9]);
}

# Same regex split as parse(), but the timestamp is converted with
# Time::Piece->strptime() instead of str2time(). Fills the globals @T
# (12 captures), $host, $timestamp and $request.
sub piece {
    # Reset the shared result slots.
    ($host, $timestamp, $request) = ('', '', '');
    @T = ();

    #--------------------------------------------------------------------------
    @T = ($s =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/);
    #--------------------------------------------------------------------------

    # pack "C4" consumes only the first four captures (the octets).
    my $packed_ip = pack("C4", @T);
    $host = unpack("N", $packed_ip);

    # Parse "dd/Mon/yyyy:hh:mm:ss +zzzz" and take the epoch seconds.
    my $moment = Time::Piece->strptime($T[6], '%d/%b/%Y:%H:%M:%S %z');
    $timestamp = $moment->epoch;

    $request = join(" ", @T[7 .. 9]);
}

パワー不足のネットブックでは、次のようになります。

---- test functionality -----------------------------------
1 2 3 4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
1: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
1 2 3 4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
2: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
---- hit <ENTER> to start Test ----
Benchmark: timing 100000 iterations of 1, 2...
         1: 29 wallclock secs (27.58 usr +  1.03 sys = 28.61 CPU) @ 3495.28/s (n=100000)
         2: 11 wallclock secs (11.25 usr +  0.00 sys = 11.25 CPU) @ 8888.89/s (n=100000)

score 1 · Accepted Answer

最終的に、次の方法を使うアプローチに落ち着きました。

- pack／unpackの代替（ビットシフトによるIPv4アドレスの整数化）
- 小さなハッシュ（月名の変換用）とTime::Localのtimegm_nocheck

これは最初の試行より約4.5倍高速で、1分あたり約1,000,000行のCLF行を分割できます。timegm関数に手を加えれば、さらに高速になる可能性があります。

#!/usr/bin/perl -w
use strict;
use warnings;
use Date::Parse;
use Time::Piece;
use Time::Local 'timegm_nocheck';
use Benchmark;

# Month abbreviation => zero-based month number (the month form that
# timegm_nocheck() is given in splitCLF()).
our %midx;
@midx{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (0 .. 11);

# Precompiled CLF pattern with 18 captures: 1-4 IPv4 octets, 5 ident,
# 6 user, 7 day, 8 month name, 9 year, 10 hour, 11 minute, 12 second,
# 13 zone, 14 method, 15 URI, 16 protocol, 17 status, 18 bytes.
our $re = qr/\A
            (\d+)\.(\d+)\.(\d+)\.(\d+)
        [ ] (\S+)
        [ ] (\S+)
        [ ] \[(\d+)\/(\S+)\/(\d+):(\d+):(\d+):(\d+) [ ] (\S+)\]
        [ ] "(\S+) [ ] (.*?) [ ] (\S+)"
        [ ] (\S+)
        [ ] (\S+)
            \z/x;

# Sample line used for the functional check and the benchmark.
my $s = '1.2.3.4 - - [13/Jun/2007:03:20:15 +0200] "GET / ..?,-" HTTP/1.0" 200 202';

# Functional check: print the list each implementation returns.
for my $splitter (\&split1ST, \&splitCLF) {
    my @fields = $splitter->($s);
    print "[", join('],[', @fields), "]\n";
}

[16909060],[1181697615],[/ ..?,-"],[GET],[HTTP/1.0],[200],[202],[-],[-]

# Wait for the user before the timing run starts. Read explicitly from STDIN:
# a bare <> would try to open any command-line arguments as input files.
print "---- hit <ENTER> to start Test ----"; <STDIN>;

# Pass closures, not code strings. Benchmark compiles a code string inside
# its own file (in package main), where the file-scoped lexical $s is not
# visible; '&splitCLF($s)' would therefore be timed on an undefined line
# while split1ST() still reads $s through its closure, which makes the
# comparison meaningless.
timethese (
  1000000,
  { 'split1ST' => sub { split1ST($s) },
    'splitCLF' => sub { splitCLF($s) },
  }
);

Benchmark: timing 1000000 iterations of split1ST, splitCLF...

split1ST: 338 wallclock secs (329.54 usr +  0.30 sys = 329.83 CPU) @ 3031.85/s (n=1000000)

splitCLF: 76 wallclock secs (73.79 usr +  0.16 sys = 73.94 CPU) @ 13523.75/s (n=1000000)

=>splitCLFは1回目の試行より4.46倍高速です

# End of the main program; the subroutines follow below.
exit 0;

1;

# First-attempt splitter: one regex plus Date::Parse's str2time().
#
# Takes the CLF line as its first argument; when no defined argument is
# given it falls back to the file-level sample line $s, which is what the
# routine used to read unconditionally.
#
# Returns the list
#   (host as 32-bit integer, epoch timestamp, request URI, method,
#    protocol, status, bytes, ident, user)
# or an empty list when the line does not match, instead of handing back
# stale capture values from an earlier match.
sub split1ST {
    my $line = defined $_[0] ? $_[0] : $s;

    # Captures go into a lexical array: the global @T is not declared in
    # this script, and the numbered capture variables are only reliable
    # immediately after a successful match.
    my @field = $line =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/
        or return;

    return ( unpack("N", pack("C4", @field[0 .. 3])), #............... host-IPv4
             str2time($field[6]), #................................... timestamp
             @field[8, 7, 9, 10, 11, 4, 5] ); # request,method,pro,sta,bytes,authusr,usr
}

# Fast CLF splitter: precompiled regex $re, bit shifts instead of
# pack/unpack, and Time::Local::timegm_nocheck() instead of a general
# date parser.
#
# Takes the CLF line as its only argument.
#
# Returns the list
#   (host as 32-bit integer, epoch timestamp, request URI, method,
#    protocol, status, bytes, ident, user)
# or an empty list when the line is undefined, does not match $re, or
# carries a month name that is not a key of %midx. The original used the
# numbered capture variables without checking the match, so a bad line
# silently produced values left over from an earlier match.
sub splitCLF {
    my ($line) = @_;
    return unless defined $line;

    my ( $o1, $o2, $o3, $o4, $ident, $user,
         $mday, $mon, $year, $hour, $min, $sec, $zone,
         $method, $uri, $proto, $status, $bytes ) = $line =~ $re
        or return;

    my $month = $midx{$mon};
    return unless defined $month;

    # The zone has the form +hhmm / -hhmm. Convert hours and minutes
    # separately: the former shortcut "zone * 36" is only correct for
    # whole-hour offsets (+0530 gave 19080 s instead of 19800 s).
    my $zone_abs = abs $zone;
    my $offset   = int($zone_abs / 100) * 3600 + ($zone_abs % 100) * 60;
    $offset = -$offset if $zone < 0;

    return ( ((((($o1<<8)|$o2)<<8)|$o3)<<8)|$o4, #..................... host-IPv4
             Time::Local::timegm_nocheck($sec,$min,$hour,$mday,$month,$year)-$offset, #ts
             $uri,$method,$proto,$status,$bytes,$ident,$user ); #request,method,pro,sta,bytes,authusr,usr
}

1;

score 0 · Accepted Answer

私は1時間かけて正規表現をいじり、spliceやsubstrだらけの恐ろしいコード、さらにはCコードにまで頭を悩ませました。そのうえで、肝心なことを実行しました。

# set the benchmark iterations down to ~ 1E4
$ perl -d:NYTProf the-script.pl
$ nytprofhtml
# open ./nytprof/index.html in browser

コードのプロファイルを取りました（Devel::NYTProfを使用）。驚くことではありませんが、文字列の解析にはほとんど時間がかかっていませんでした。split1での正規表現の適用には、合計で約144ミリ秒しかかかっていません。一方、日付の解析では、str2timeでなんと3.39秒が費やされていました。ほぼ1:25の比率です！

結論：

時期尚早な最適化は諸悪の根源である。– D. クヌース

次のような、読みやすい正規表現を使用します。

# Precompiled, /x-formatted pattern for one complete CLF line (anchored with
# \A and \z). Captures, in order: the four IPv4 octets (1-4), two non-space
# fields (5, 6), the text between the square brackets (7), the whole quoted
# request as a single capture (8), and two trailing non-space fields (9, 10).
# Under /x literal spaces are ignored, so each real space is written as [ ].
my $split1_1_regex = qr/\A
        (\d+)\.(\d+)\.(\d+)\.(\d+)
    [ ] (\S+)
    [ ] (\S+)
    [ ] \[( [^\]]+ )\]
    [ ] "(\S+ [ ] .*? [ ] \S+)"
    [ ] (\S+)
    [ ] (\S+)
\z/x;

これは、（r）index / substrホラーとほぼ同じ速度で実行されますが、ある程度自己文書化されており、確実にデバッグが容易です。これは、クリーンで慣用的なPerlが最速のPerlである可能性が高いという経験と一致しています。

次に、str2timeをそのまま受け入れるか、それを最適化するかを選択できます。実証可能な高速化を達成できた場合は、パッチをアップストリームに送ることを検討するとよいでしょう。他のライブラリを試したり、str2timeの代わりに、特定のユースケースに最適化した独自の関数を書いたりすることもできます。

perl - Common Log Format（CLF）の行の分割

3 に答える 3

結論：

Related

Reference