0

少し前にAsk Ubuntuでこの質問をしたところ、おそらく最適化の問題であるため、ここにコードを提供するように指示されました。スクリプト全体を含めました。

一般的な目標は、ディレクトリ内の約 7000 個の HTML ファイルを調べて、それらから特定の情報を解析し、それを 1 行としてテキスト ファイルにエクスポートすることです。

#!/usr/bin/perl

use Switch;
use strict;

use HTML::Query 'Query';

my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';

opendir my $dh, $dir or die "Can't open $dir: $!";
my @files = map {"$dir/$_"} grep { $_ !~ /^\./ } readdir $dh;
closedir $dh;

my $total;

my %xlateNum2Text =  qw (0   January
                         1   Febuary
                         2   March
                         3   April
                         4   May
                         5   June
                         6   July
                         7   August
                         8   September
                         9   October
                         10  November
                         11  December                  
                       );


my $inc = 0;
foreach my $file (@files) {
    open FILE, $file;
    my $html = do { local $/; <FILE> };
    my $q = Query(text => $html);

    my @homescore = $q->query("span.homeScore");
    my @awayscore = $q->query("span.awayScore");
    my $singlehomescore = $homescore[0]->as_text();
    my $singleawayscore = $homescore[0]->as_text();

    my @hometeam = $q->query("table.teaminfo td.home span");
    my @awayteam = $q->query("table.teaminfo td.away span");
    my $singlehometeam = rightTeamName($hometeam[0]->as_text());
    my $singleawayteam = rightTeamName($awayteam[0]->as_text());


    my @homegoalstotal;
    my @awaygoalstotal;
    my @datearray;
    my @fixtureinfo;

    my @newhomegoals;
    my @newawaygoals;

    my @allinfogoals;

    if($singlehomescore ne "0" || $singleawayscore ne "0") {
        @homegoalstotal = $q->query("div.home ul li");
        @awaygoalstotal = $q->query("div.away ul li");
        my $i = 0;

        @datearray = $q->query("p.fixtureinfo span");
        my $finaldate = $datearray[0]->as_text();
        my @datecomponents = split(" ", $finaldate);
        my $mysqlyyyy = $datecomponents[3];
        my $mysqlmm = monthConvert($datecomponents[2]); 
        my $mysqldd = $datecomponents[1];

        my $mysqldate;

        if(length($mysqlmm) == 1) {
            $mysqlmm = "0".$mysqlmm;
        }

        if(length($mysqldd) == 1) {
            $mysqldd = "0".$mysqldd;
        }

        $mysqldate = $mysqlyyyy."-".$mysqlmm."-".$mysqldd;


        @fixtureinfo = $q->query("p.fixtureinfo");
        my $fixtureinfoinit = $fixtureinfo[0]->as_text();
        my @fixtureinfobrokenup = split(/ \| /, $fixtureinfoinit);
        my $fixtureinfostring = $fixtureinfobrokenup[1];

        foreach my $goal (@homegoalstotal) {
            my $tempmodifier = $goal->as_text();
            $tempmodifier =~ s/\)//g;
            my @tempcomponents = split(' \(', $tempmodifier);
            my $substitutetemp;
            my @extratimesplit;
            my $compositetime;

            if(index($tempcomponents[1], ",") != -1) {
                my @goaltimes = split('\,', $tempcomponents[1]);
                foreach my $individmultgoal (@goaltimes) {
                    $individmultgoal =~ s/Pen//g;
                    $individmultgoal =~ s/ //g;
                    if(index($individmultgoal, "OG") == -1) {
                        if(index($individmultgoal, "+") != -1) {
                            @extratimesplit = split('\+', $individmultgoal);
                            $compositetime = $extratimesplit[0];
                            push (@{$allinfogoals[$i]}, ($tempcomponents[0], $compositetime, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], ,$datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit[1]));
                            $i++;
                        } else {
                            push (@{$allinfogoals[$i]}, ($tempcomponents[0], $individmultgoal, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], ,$datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                            $i++;
                        }   
                    }
                }
            } else {
                $substitutetemp = $tempcomponents[1];
                $substitutetemp =~ s/Pen//g;
                $substitutetemp =~ s/ //g;
                if(index($substitutetemp, "OG") == -1) {
                    if(index($substitutetemp, "+") != -1) {
                        @extratimesplit = split('\+', $substitutetemp);
                        $compositetime = $extratimesplit[0];
                        push (@{$allinfogoals[$i]}, ($tempcomponents[0], $compositetime, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit[1]));
                        $i++;
                    } else {
                        push (@{$allinfogoals[$i]}, ($tempcomponents[0], $substitutetemp, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                        $i++;
                    }
                }
            }
        }

        foreach my $goal (@awaygoalstotal) {
            my $tempmodifier2 = $goal->as_text();
            $tempmodifier2 =~ s/\)//g;
            my @tempcomponents2 = split(' \(', $tempmodifier2);
            my $substitutetemp2;
            my @extratimesplit2;
            my $compositetime2;

            if(index($tempcomponents2[1], ",") != -1) {
                my @goaltimes2 = split('\,', $tempcomponents2[1]);
                foreach my $individmultgoal2 (@goaltimes2) {
                    $individmultgoal2 =~ s/Pen//g;
                    $individmultgoal2 =~ s/ //g;
                    if(index($individmultgoal2, "OG") == -1) {
                        if(index($individmultgoal2, "+") != -1) {
                            @extratimesplit2 = split('\+', $individmultgoal2);
                            $compositetime2 = $extratimesplit2[0];
                            push (@{$allinfogoals[$i]}, ($tempcomponents2[0], $compositetime2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit2[1]));
                            $i++;
                        } else {
                            push (@{$allinfogoals[$i]}, ($tempcomponents2[0], $individmultgoal2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                            $i++;
                        }
                    }
                }
            } else {
                $substitutetemp2 = $tempcomponents2[1];
                $substitutetemp2 =~ s/Pen//g;
                $substitutetemp2 =~ s/ //g;
                if(index($substitutetemp2, "OG") == -1) {
                    if(index($substitutetemp2, "+") != -1) {
                        @extratimesplit2 = split('\+', $substitutetemp2);
                        $compositetime2 = $extratimesplit2[0];
                        push(@{$allinfogoals[$i]}, ($tempcomponents2[0], $compositetime2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit2[1]));
                        $i++;
                    } else {
                        push(@{$allinfogoals[$i]}, ($tempcomponents2[0], $substitutetemp2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                        $i++;
                    }
                }
            }
        }


        @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[12] <=> $b->[12] } @allinfogoals;

        open(GOALCSV, '>>goalcsv.txt');

        my $homegoalcount = 0;
        my $awaygoalcount = 0;

        foreach my $row(@allinfogoals){
           foreach my $val(@$row){
                if($val eq "for:".$singlehometeam) {
                    $homegoalcount++;
                    print GOALCSV "$val,".$homegoalcount.",".$awaygoalcount.",true,";
                    print "$val,".$homegoalcount.",".$awaygoalcount.",true,";
                } elsif($val eq "for:".$singleawayteam) {
                    $awaygoalcount++;
                    print GOALCSV "$val,".$awaygoalcount.",".$homegoalcount.",false,";
                    print "$val,".$awaygoalcount.",".$homegoalcount.",false,";
                } else {
                    print GOALCSV "$val,";
                    print "$val,";
                }
           }
           print GOALCSV "\n";
           print "\n";
        }
    }

}

sub rightTeamName{
    my $teamname = $_[0];

    switch($teamname) {
        case "Nott'm Forest" { return "Nottingham Forest" }
        case "QPR"  { return "Queens Park Rangers" }
        case "Southampton" { return "Southampton FC" }
        case "Norwich" { return "Norwich City" }
        case "Tottenham" { return "Tottenham Hotspur" }
        case "Leeds" { return "Leeds United" }
        case "Middlesbrough" { return "Middlesbrough FC" }
        case "Chelsea" { return "Chelsea FC" }
        case "Arsenal" { return "Arsenal FC" }
        case "Oldham" { return "Oldham Athletic" }
        case "Ipswich" { return "Ipswich Town" }
        case "Man Utd" { return "Manchester United" }
        case "Man City" { return "Manchester City" }
        case "Sheffield Wed" { return "Sheffield Wednesday" }
        case "Man City" { return "Manchester City" }
        case "Blackburn" { return "Blackburn Rovers" }
        case "Wimbledon" { return "AFC Wimbledon" }
        case "Liverpool" { return "Liverpool FC" }
        case "Coventry" { return "Coventry City" }
        else        { return $teamname }

    }
}

sub monthConvert{
        switch($_[0]) {
            case "January" { return 1 }
            case "February" { return 2 }
            case "March" { return 3 }
            case "April" { return 4 }
            case "May" { return 5 }
            case "June" { return 6 }
            case "July" { return 7 }
            case "August" { return 8 }
            case "September" { return 9 }
            case "October" { return 10 }
            case "November" { return 11}
            case "December" { return 12 }
        }
}
4

2 に答える 2

3

HTML::Query は HTML::Element と HTML::TreeBuilder を使用してドキュメントのノードをモデル化します。ノードは複雑な方法で接続されているため、Perl ガベージ コレクターがノードをクリーンアップできません。したがって、次のいずれかを行う必要があります

  • 弱い参照を使用できる HTML::Element のバージョンがあることを表明します。これらはガベージ コレクションを妨げません。Ause HTML::TreeBuilder 5 -weakはトリックを行う必要があります。

  • deleteメソッドの任意の結果を呼び出しqueryます。

詳細については、ドキュメント ( HTML::Elementなど) を参照してください。

以下は、コードの重複を削減しようとするスクリプトのクリーンアップ バージョンです (元のコードにはコピー アンド ペーストの明確な兆候がありました)。まだ美しくはありませんし、WTF もいくつか残っていますが、保守性が向上するはずです。特に、12 番目の列@allinfogoalsが (並べ替えで) 一体何なのか、またはなぜ CSV がこのかなり奇妙な方法で出力されるのかはわかりません (for:列のインデックス (→ 2) は既にわかっているので、わかりません)。期待値に対してすべての列を一致させる必要があります)。

欠落している if-else を理解するためのヒント: 文字列に特定の部分文字列が含まれていない場合、その部分文字列で文字列を分割した結果の戻り値は、元の文字列と等しくなります。コードとして:

use Test::More;
my ($string, $substring) = ("foo+bar", "-"); # try it yourself!
my ($split) = split /\Q$substring\E/, $string;
if (-1 == index $string, $substring) {
  is $split, $string;
} else {
  isnt $split, $string;
}
done_testing;

クリーンアップされたバージョンは次のとおりです。

#!/usr/bin/perl

use strict; use warnings;

use HTML::TreeBuilder 5 -weak;
use HTML::Query;

my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';

opendir my $dh, $dir or die "Can't open $dir: $!";

while (my $filename = readdir $dh) {
    next if $filename =~ /^\./;
    my $q = HTML::Query->new(file => "$dir/$filename");

    my $homescore = $q->query("span.homeScore")->first->as_text;
    my $awayscore = $q->query("span.awayScore")->first->as_text;

    my $hometeam = correctTeamName($q->query("table.teaminfo td.home span")->first->as_text);
    my $awayteam = correctTeamName($q->query("table.teaminfo td.away span")->first->as_text);

    my @allinfogoals;

    if($homescore ne "0" || $awayscore ne "0") {

        my ($fixtureinfo_span) = $q->query("p.fixtureinfo span");
        my (undef, $day, $month, $year) = split ' ', $fixtureinfo_span->as_text;
        my $mysqldate = sprintf '%04d-%02d-%02d', $year, monthConvert($month), $day;

        my ($fixtureinfo) = $q->query('p.fixtureinfo');
        my (undef, $fixtureinfostring) = split / \| /, $fixtureinfo->as_text;

        for my $goal_list (
            [$hometeam, $awayteam, [$q->query("div.home ul li")->as_text]],
            [$awayteam, $hometeam, [$q->query("div.away ul li")->as_text]]
        ) {
            my ($thisteam, $otherteam, $goalstotal) = @$goal_list;
            for my $goal (@$goalstotal) {
                $goal =~ s/\)//g;
                my ($tempcomponent_1, $tempcomponent) = split / \(/, $goal;

                for my $individmultgoal (split/,/, $tempcomponent) {
                    next if -1 != index $individmultgoal, 'OG';
                    $individmultgoal =~ s/Pen//g;
                    $individmultgoal =~ s/ //g;
                    my @timesplit = 
                        (index($individmultgoal, "+") != -1)
                        ? (split /\+/, $individmultgoal)
                        : ($individmultgoal, 0);
                    push @allinfogoals, [
                        $tempcomponent_1,
                        $timesplit[0],
                        "for:$thisteam",
                        $otherteam,
                        $day,
                        $month,
                        $year,
                        $fixtureinfostring,
                        "Barclays Premier League",
                        monthConvert($month),
                        $mysqldate,
                        $timesplit[1],
                    ];
                }
            }
        }

        @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[12] <=> $b->[12] } @allinfogoals;

        open my $GOALCSV, '>>', 'goalcsv.txt' or die "Can't open goalcsv.txt: $!";

        my $print_both = sub {
            print {$GOALCSV} @_;
            print            @_;
        };

        my $homegoalcount = 0;
        my $awaygoalcount = 0;

        for my $row (@allinfogoals){
            for my $val(@$row){
                if($val eq "for:$hometeam") {
                    $homegoalcount++;
                    $print_both->("$val,$homegoalcount,$awaygoalcount,true,");
                } elsif($val eq "for:$awayteam") {
                    $awaygoalcount++;
                    $print_both->("$val,$awaygoalcount,$homegoalcount,false,");
                } else {
                    $print_both->("$val,");
                }
            }
            $print_both->("\n");
        }
    }
}

closedir $dh;

sub correctTeamName{
    my %teamnames = (
        "Nott'm Forest" => "Nottingham Forest",
        "QPR"           => "Queens Park Rangers",
        "Southampton"   => "Southampton FC",
        "Norwich"       => "Norwich City",
        "Tottenham"     => "Tottenham Hotspur",
        "Leeds"         => "Leeds United",
        "Middlesbrough" => "Middlesbrough FC",
        "Chelsea"       => "Chelsea FC",
        "Arsenal"       => "Arsenal FC",
        "Oldham"        => "Oldham Athletic",
        "Ipswich"       => "Ipswich Town",
        "Man Utd"       => "Manchester United",
        "Man City"      => "Manchester City",
        "Sheffield Wed" => "Sheffield Wednesday",
        "Man City"      => "Manchester City",
        "Blackburn"     => "Blackburn Rovers",
        "Wimbledon"     => "AFC Wimbledon",
        "Liverpool"     => "Liverpool FC",
        "Coventry"      => "Coventry City",
    );
    return exists $teamnames{$_[1]} ? $teamnames{$_[1]} : $_[0];
}

sub monthConvert{
    my $i = 1;
    my %months = map { $_ => $i++ } qw/
        January February    March
        April   May         June
        July    August      September
        October November    December
    /;
    exists $months{$_[0]} or die "Unknown month name $_[0]";
    return $months{$_[0]};
}

注: サンプル ファイルが提供されていないため、コードはテストされていません。少なくともコンパイルされます。

于 2013-05-17T10:28:53.133 に答える