私は少し乱雑なコードで試したばかりの新しい perl です。
猫input1.txt
##gff-version 2
##source-version geneious 5.6.4
Xm_ABL1 Geneious CDS 1 168 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 169 334 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 335 628 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 629 901 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 902 985 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 986 1165 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 1166 1350 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious CDS 1351 1504 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
Xm_ABL1 Geneious BLAST Hit 169 334 . + .
Xm_ABL1 Geneious extracted region 1 168 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="351297 -> 351464"
Xm_ABL1 Geneious extracted region 169 334 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="371785 -> 371950"
Xm_ABL1 Geneious extracted region 335 628 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="372554 -> 372847"
Xm_ABL1 Geneious extracted region 629 901 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="374760 -> 375032"
Xm_ABL1 Geneious extracted region 902 985 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="375230 -> 375313"
Xm_ABL1 Geneious extracted region 986 1165 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="375992 -> 376171"
Xm_ABL1 Geneious extracted region 1166 1350 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="376575 -> 376759"
Xm_ABL1 Geneious extracted region 1351 1504 . + . Name=Extracted region from gi|371443098|gb|JH556762.1|;Extracted interval="376914 -> 377067"
入力ファイルに (->) 進む矢印が含まれている場合。 if($array[7]=~/.*interval=\"\d+ -> \d+\"$/gm){ $array[5]= "+"; }
猫出力1.txt
gi_371443098_gb_JH556762.1 gene 351297 377067 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 351297 351464 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 371785 371950 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 372554 372847 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 374760 375032 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 375230 375313 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 375992 376171 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 376575 376759 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 376914 377067 . + . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
###
cat output1.txt 入力ファイルに (<-) 逆矢印が含まれている場合。if($array[7]=~/.*interval=\"\d+ <- \d+\"$/gm){ $array[5]="-"; }
gi_371443098_gb_JH556762.1 gene 351297 377067 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 351297 351464 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 371785 371950 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 372554 372847 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 374760 375032 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 375230 375313 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 375992 376171 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 376575 376759 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
gi_371443098_gb_JH556762.1 CDS 376914 377067 . - . Name=Xm_ABL1;created by=User;modified by=User;ID=w0IVHutPuN4H4FVDCg4sFVRaJjQ.1340919460469.4
###
私は初心者なので、少し厄介なコードで試しました。
#usr/bin/perl;
use strict;
open(FH,"$ARGV[0]");
while(<FH>){
chomp $_;
my @array=split("\t");
my $key="$array[2]-$array[0]-$array[1]-$array[2]-$array[3]";
if($array[1] eq "CDS"){
$cds_cnt{$key}++;
$cds{$key}="$array[4]\t$array[5]\t$array[6]\t$array[7]";
}
if($array[1] eq "extracted region"){
(my $pos1,my $pos2)=($array[7]=~/.*interval=\"(\d+) -> (\d+)\"$/gm);
$extract_cnt{$key}++;
$extract{$key}="$pos1\t$pos2";
}
}
foreach $i ( sort {$a<=>$b} keys %cds){
my $a=$i; #print "$i\n";
$a=~s/CDS/extracted region/g;
if($cds_cnt{$i} == $extract_cnt{$a}){
#print "$i\t$cds{$i}\n$a\t$extract{$a}\n";
my @array=split /\-/,$i;
my @pos=split "\t",$extract{$a};
print "$array[1]\t$array[2]\t$pos[0]\t$pos[1]\t$cds{$i}\n";
}
}
print "###";
アップデート
コードで変更する必要があるもの
1.抽出された領域の行から値を取得するには (つまり、array[7]=/gi|371443098|gb|JH556762.1|/)、任意の値にすることができます。アンダースコアを追加し (つまり、gi_371443098_gb_JH556762.1)、出力します。示されているように、output1.txt の array[0] にあります。
2. 印刷中に最初の行として新しい行を追加 (gi_371443098_gb_JH556762.1 遺伝子)、列 3 で CDS の開始値 (つまり 351297) を取得し、列 4 で CDS の終了値 (つまり 377067) を取得し、最初の行のように印刷します。 output1.txt に表示
3. If /extracted region/ block-all rows for.egExtracted interval="351297 -> 351464" (つまり、前方矢印) 配列 [5] を "+" 記号として出力に遺伝子ヘッダーを含めて出力します。if egExtracted interval="351297 <- 351464" (逆矢印) 出力に遺伝子ヘッダーを含む "-" 記号として array[5] を出力します。