python - FASTQ ファイルのペアで対になっていない読み取りをフィルター処理する Python スクリプトのパフォーマンスを向上させる

Question

ペアエンドシーケンスデータがいくつかあります。ただし、ファイルが正しくペアリングされていません。読み取りペアファイルの一貫性を保つために削除する必要があるペアになっていない読み取りがあります。Trimmomatic を使用するような解決策はありますが。自分のスクリプトのパフォーマンスを向上させる方法について提案が必要です。現在のバージョンでは、1 秒あたり約 10K レコードが処理されます。

import sys


class FastqRecord(object):
    def __init__(self, block, unique_col=1, unique_type=int, sep=' '):
        self.block = "".join(block[:])
        self.header = block[0][1:].rstrip('\n')
        self.uid = unique_type(self.header.split(sep)[unique_col])

    def __eq__(self, other):
        return self.uid == other.uid

    def __gt__(self, other):
        return self.uid > other.uid

class Fastq(object):
    def __init__(self, filename):
        self.file = filename
        self.handle = open(self.file)
        self.count = 0
        self.record = self.next()

    def __iter__(self):
        return self

    def next(self):
        return FastqRecord([self.handle.next(), self.handle.next(), self.handle.next(), self.handle.next()])

    def update(self):
        try:
            self.record = self.next()
            self.count+=1
        except StopIteration:
            self.record = False
            self.handle.close()

def fn_from_path(path):
    return path.split('/')[-1].split('.')[0]

def flush_handle(obj, handle):
    handle.write(obj.record.block)
    for x in obj:
        handle.write(x.block)
    return True

@profile
def main():
    file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
    #file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
    out_handles = {
        'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
        'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
        's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
    }
    f1, f2 = Fastq(file1), Fastq(file2)
    while True:
        print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
        sys.stdout.flush()
        if f1.record is False or f2.record is False:
            if f1.record is False:
                flush_handle(f2, out_handles['s'])
            else:
                flush_handle(f1, out_handles['s'])
            break
        if f1.record == f2.record:
            out_handles['p1'].write(f1.record.block)
            out_handles['p2'].write(f2.record.block)
            f1.update()
            f2.update()
        elif f1.record > f2.record:
            out_handles['s'].write(f2.record.block)
            f2.update()
        else:
            out_handles['s'].write(f1.record.block)
            f1.update()
    for i in out_handles:
        out_handles[i].close()
    print "\n Done!"

if __name__ == "__main__":
    main()

これが私の FASTQ ファイルの外観です。

head SRR2130002_1.fasta

@SRR2130002.1 1 length=39
CCCTAACCCTAACCCTAACCCTAACCCAAAGACAGGCAA
+SRR2130002.1 1 length=39
AAAAA7F<<FF<F.<FFFFF77F.))))FA.))))))7A
@SRR2130002.2 2 length=39
TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTATCTCGTA
+SRR2130002.2 2 length=39
<AAAAAA.7FFFFFFFFFF.<...).A.F7F7)A.FAA<
@SRR2130002.3 3 length=39
CTACCCCTAACCCTAACCCTAACAAAACCCCAAAACAAA

ありがとう

更新: main() 関数でラインプロファイラー (kernprof) を実行しました。そして、次の統計を得ました：

タイマー単位：1e-06秒

合計時間: 0.808629 秒ファイル: paired_extractor.py 関数: main at line 46

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    46                                           @profile
    47                                           def main():
    48         1            4      4.0      0.0      file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
    49                                               #file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
    50         1            1      1.0      0.0      out_handles = {
    51         1         4830   4830.0      0.6          'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
    52         1         4118   4118.0      0.5          'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
    53         1         1283   1283.0      0.2          's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
    54                                               }
    55         1         2945   2945.0      0.4      f1, f2 = Fastq(file1), Fastq(file2)
    56     25001        23354      0.9      2.9      while True:
    57     25001       105393      4.2     13.0          print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
    58     25001        68264      2.7      8.4          sys.stdout.flush()
    59     25001        29858      1.2      3.7          if f1.record is False or f2.record is False:
    60         1            1      1.0      0.0              if f1.record is False:
    61         1         9578   9578.0      1.2                  flush_handle(f2, out_handles['s'])
    62                                                       else:
    63                                                           flush_handle(f1, out_handles['s'])
    64         1            1      1.0      0.0              break
    65     25000        47062      1.9      5.8          if f1.record == f2.record:
    66     23593        41546      1.8      5.1              out_handles['p1'].write(f1.record.block)
    67     23593        34040      1.4      4.2              out_handles['p2'].write(f2.record.block)
    68     23593       216319      9.2     26.8              f1.update()
    69     23593       196077      8.3     24.2              f2.update()
    70      1407         2340      1.7      0.3          elif f1.record > f2.record:
    71                                                       out_handles['s'].write(f2.record.block)
    72                                                       f2.update()
    73                                                   else:
    74      1407         2341      1.7      0.3              out_handles['s'].write(f1.record.block)
    75      1407        13231      9.4      1.6              f1.update()
    76         4            9      2.2      0.0      for i in out_handles:
    77         3         6023   2007.7      0.7          out_handles[i].close()
    78         1           11     11.0      0.0      print "\n Done!"

score 0 · Accepted Answer

プロファイラーを使用してデューデリジェンスを行わずにコードを最適化することは通常、困難/不可能であるという事実に注意してください (fasta ファイルがないため、これを実行しようとしています)。また、文字列に関するいくつかの良い習慣があるようです ( の使用などjoin)。必要ないようですね

print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
sys.stdout.flush()

これは統計をコンソールに出力しているだけです。それを取り除くことで少しスピードアップします...しかし、私には大きなことは何もありませんでした。

個人的な意見として、一般的な処理速度が必要な場合は、C/C++ を使用することをお勧めします。インタープリターのオーバーヘッドはありません。Python インタープリターはすばらしいツールですが、速度を重視して構築されたわけではありません。正確性/セキュリティ/使いやすさのために構築されました。特に文字列操作に関しては、Perl は私が聞いた限りでは王様ですが、確かに私は人生で数十行しか Perl を書いたことがありません。

python - FASTQ ファイルのペアで対になっていない読み取りをフィルター処理する Python スクリプトのパフォーマンスを向上させる

1 に答える 1

Related

Reference