以下のコードを見てください。genMotifs のパラメーターを n_seq=5000、n_pos=10 に設定すると、並列バージョンである getPairedSeqNames3 と getPairedSeqNames1 は、逐次バージョンの getPairedSeqNames2 よりもはるかに遅くなります。一方、n_seq=50、n_pos=2000 の場合は、並列バージョンのほうが逐次バージョンより高速です。残念ながら、私が実際に扱っているデータは n_seq=5000、n_pos=10 に近いものです。なぜこのようなことが起こるのか、どなたか教えていただけますか? また、n_seq=5000、n_pos=10 の場合に並列バージョンのパフォーマンスを向上させる方法はありますか?
コードは次のとおりです。
#! /usr/bin/env python
import pp, sys, random, time
def getMotif_SeqName(Motifs):
    """Map every motif id to the set of sequence names recorded for it.

    Motifs is a dict of dicts (motif id -> {sequence name -> positions});
    only the keys of the inner dicts are used, the positions are ignored.
    """
    names_by_uid = {}
    for uid, per_sequence in Motifs.items():
        names_by_uid[uid] = set(per_sequence.keys())
    return names_by_uid
def getPairedList(uids):
    """Return every (earlier, later) pair of unequal ids from a sequence.

    Each element is paired with itself and with the elements after it
    (uids must support slicing); pairs whose two members compare equal
    are dropped.
    """
    pairs = []
    for start, first in enumerate(uids):
        for second in uids[start:]:
            if first != second:
                pairs.append((first, second))
    return pairs
def is_overlap(pos_pair):
    """Tell whether the two intervals in pos_pair share at least one point.

    pos_pair is a 2-tuple (posA, posB); each member is a non-empty
    sequence whose min/max are taken as the interval bounds.
    Intervals that merely touch at an endpoint count as overlapping.
    """
    posA, posB = pos_pair
    # A lies entirely before B.
    if max(posA) < min(posB):
        return False
    # Otherwise they overlap unless A lies entirely after B.
    return not (min(posA) > max(posB))
def caclDist(pos_pair):
    """Return the signed gap between the two intervals in pos_pair.

    pos_pair is a 2-tuple (posA, posB) of non-empty sequences.  For
    non-overlapping intervals the result is positive when posA lies
    before posB and negative when posA lies after posB.
    """
    posA, posB = pos_pair
    gap_a_first = min(posB) - max(posA)
    gap_b_first = min(posA) - max(posB)
    return gap_a_first if gap_a_first > gap_b_first else -gap_b_first
def getDist(posA, posB, low, high):
    """Collect the signed distances between non-overlapping position pairs.

    Every position in posA is combined with every position in posB.
    Overlapping pairs (per is_overlap) are skipped; for the rest the
    distance from caclDist is kept when its absolute value lies in
    the inclusive range [low, high].

    Returns a dict mapping (position_from_A, position_from_B) -> distance.
    """
    kept = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            gap = caclDist(pair)
            if low <= abs(gap) <= high:
                kept[pair] = gap
    return kept
def getDist2(uidA, uidB, seqname, posA, posB, low, high):
    """Same computation as getDist, tagged with the identifiers it belongs to.

    uidA, uidB and seqname are not used in the computation; they are
    passed through unchanged so the caller can tell which job produced
    which result.

    Returns the tuple (uidA, uidB, seqname, distances) where distances maps
    (position_from_A, position_from_B) -> signed distance for every
    non-overlapping pair whose absolute distance is within [low, high].
    """
    kept = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            gap = caclDist(pair)
            if low <= abs(gap) <= high:
                kept[pair] = gap
    return (uidA, uidB, seqname, kept)
def ppCacl(job_server, inputs, equation, funs, packages, Progress=True):
    """Submit one job per argument tuple and return the results in input order.

    job_server -- object with a submit(func, args, depfuncs, modules) method
                  that returns a callable yielding the job's result
                  (the script passes a pp.Server).
    inputs     -- iterable of argument tuples, one tuple per job.
    equation   -- function run for each argument tuple.
    funs       -- tuple of helper functions that equation calls.
    packages   -- tuple of module names that equation needs.
    Progress   -- unused; kept so existing callers that pass it still work.
    """
    # All jobs are submitted before the first result is requested, so
    # waiting on one job never delays the submission of the next.
    jobs = [job_server.submit(equation, pars, funs, packages) for pars in inputs]
    return [job() for job in jobs]
def ssCacl(inputs, equation):
    """Serially apply equation to every 4-tuple in inputs.

    Each element of inputs must unpack into exactly four values, which
    are passed positionally to equation.  Results are returned as a
    list in input order.
    """
    return [equation(X, n, m, N) for (X, n, m, N) in inputs]
def getPairedSeqNames1(Motifs):
    """Find co-occurring motif pairs, running one batch of jobs per pair.

    For every pair of distinct motif ids, the sequences containing both
    motifs are looked up and getDist is run for each shared sequence
    through ppCacl on the module-level job_server (distance window 10
    to 250).  Progress is written to stderr.

    Returns {(uidA, uidB): {seqname: distances}} containing only the
    sequences, and only the pairs, with a non-empty result.
    """
    names_by_uid = getMotif_SeqName(Motifs)
    pairs = set(getPairedList(Motifs.keys()))
    print("%s pairs to go" % len(pairs))
    # Number of pairs per progress unit; the index is divided by this.
    pairs_per_step = len(pairs) // 100 + 1
    found = {}
    for idx, (uidA, uidB) in enumerate(pairs):
        shared = list(names_by_uid[uidA] & names_by_uid[uidB])
        if not shared:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (idx // pairs_per_step, uidA, uidB))
        job_args = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in shared]
        per_sequence = ppCacl(job_server, job_args, getDist, (is_overlap, caclDist), (), False)
        # Results come back in the same order as the shared sequence names.
        non_empty = {}
        for seqname, codist in zip(shared, per_sequence):
            if codist:
                non_empty[seqname] = codist
        if non_empty:
            found[(uidA, uidB)] = non_empty
    return found
def getPairedSeqNames2(Motifs):
    """Find co-occurring motif pairs serially (no job server involved).

    For every pair of distinct motif ids, the sequences containing both
    motifs are looked up and getDist is applied to each shared sequence
    via ssCacl (distance window 10 to 250).  Progress is written to
    stderr.

    Returns {(uidA, uidB): {seqname: distances}} containing only the
    sequences, and only the pairs, with a non-empty result.
    """
    names_by_uid = getMotif_SeqName(Motifs)
    pairs = set(getPairedList(Motifs.keys()))
    print("%s pairs to go" % len(pairs))
    # Number of pairs per progress unit; the index is divided by this.
    pairs_per_step = len(pairs) // 100 + 1
    found = {}
    for idx, (uidA, uidB) in enumerate(pairs):
        shared = list(names_by_uid[uidA] & names_by_uid[uidB])
        if not shared:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (idx // pairs_per_step, uidA, uidB))
        call_args = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in shared]
        per_sequence = ssCacl(call_args, getDist)
        # Results come back in the same order as the shared sequence names.
        non_empty = {}
        for seqname, codist in zip(shared, per_sequence):
            if codist:
                non_empty[seqname] = codist
        if non_empty:
            found[(uidA, uidB)] = non_empty
    return found
def getPairedSeqNames3(Motifs):
    """Find co-occurring motif pairs with a single batch of jobs.

    The argument tuples for all motif pairs and all of their shared
    sequences are gathered first, then handed to ppCacl in one call
    (getDist2 on the module-level job_server, distance window 10 to
    250).  Progress of the gathering phase is written to stderr.

    Returns {(uidA, uidB): {seqname: distances}} containing only the
    entries with a non-empty result.
    """
    names_by_uid = getMotif_SeqName(Motifs)
    pairs = set(getPairedList(Motifs.keys()))
    print("%s pairs to go" % len(pairs))
    # Number of pairs per progress unit; the index is divided by this.
    pairs_per_step = len(pairs) // 100 + 1
    job_args = []
    for idx, (uidA, uidB) in enumerate(pairs):
        shared = list(names_by_uid[uidA] & names_by_uid[uidB])
        if not shared:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (idx // pairs_per_step, uidA, uidB))
        for seqname in shared:
            job_args.append((uidA, uidB, seqname, PosA[seqname], PosB[seqname], 10, 250))
    results = ppCacl(job_server, job_args, getDist2, (is_overlap, caclDist), (), False)
    found = {}
    # getDist2 echoes the identifiers back, so each result can be filed
    # under its motif pair and sequence name.
    for (uidA, uidB, seqname, codist) in results:
        if codist:
            found.setdefault((uidA, uidB), {})[seqname] = codist
    return found
def genMotifs(n_seq=5000, n_pos=10):
    """Generate random test data: 50 motifs with random sequences and positions.

    Motif ids and sequence names are drawn without replacement from
    1..59999.  Each motif gets between 0 and n_seq sequences, and each
    sequence gets between 0 and n_pos positions from genPos.

    Returns {motif_id: {seqname: [position, ...]}}.
    """
    id_pool = range(1, 60000)
    motifs = {}
    for uid in random.sample(id_pool, 50):
        how_many = random.randint(0, n_seq)
        per_sequence = {}
        for seqname in random.sample(id_pool, how_many):
            per_sequence[seqname] = genPos(random.randint(0, n_pos))
        motifs[uid] = per_sequence
    return motifs
def genPos(n):
    """Return a list of n random positions.

    Each position is a 2-tuple of independent random integers in
    0..3000 inclusive; the two members are not ordered.
    """
    positions = []
    for _ in range(n):
        first = random.randint(0, 3000)
        second = random.randint(0, 3000)
        positions.append((first, second))
    return positions
# Job server used (as a module-level global) by getPairedSeqNames1 and
# getPairedSeqNames3.
job_server = pp.Server()

# Benchmark: time the three implementations, first on data generated with
# the genMotifs defaults (n_seq=5000, n_pos=10), then with n_seq=50 and
# n_pos=2000.  One elapsed time in seconds is printed per run.
for gen_args in ((), (50, 2000)):
    Motifs = genMotifs(*gen_args)
    for finder in (getPairedSeqNames1, getPairedSeqNames2, getPairedSeqNames3):
        timestamp = time.time()
        finder(Motifs)
        print(time.time() - timestamp)
私のコンピューターでの結果:
1225 pairs to go
57.377081871 16666 20431
1225 pairs to go
15.1005380154 16666 20431
1225 pairs to go
59.9019329548 16666 20431
1225 pairs to go
43.1178700924 11721 46015
1225 pairs to go
77.7199709415 11721 46015
1225 pairs to go
10.1687381268 11721 46015
getPairedSeqNames3 の cProfile n_seq=5000 n_pos=10
getPairedSeqNames3 の cProfile n_seq=10 n_pos=5000
getPairedSeqNames3 の cProfile n_seq=20 n_pos=2500