python - random.sample のパフォーマンスを向上させる方法

Question

辞書に格納されている要素をランダムに選択する関数を作成しています:

import random
from liblas import file as lasfile
from collections import defaultdict

def point_random_selection(list,k):
    """Return k elements drawn from ``list`` with ``random.sample``.

    If ``random.sample`` raises ``ValueError`` (for example when ``k`` is
    larger than the number of available elements), the input sequence
    itself is returned unchanged instead.
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept
    # because it is part of the function's signature.
    try:
        return random.sample(list, k)
    except ValueError:
        return list

def world2Pixel_Id(x,y,X_Min,Y_Max,xDist,yDist):
    """Build the grid-cell identifier string for a coordinate pair.

    The column is the x offset from ``X_Min`` divided by ``xDist`` and the
    row is the y offset below ``Y_Max`` divided by ``yDist``; both are
    truncated with ``int()``. The result is the string ``"<col>_<row>"``.
    """
    cell = (int((x - X_Min) / xDist), int((Y_Max - y) / yDist))
    return "{0}_{1}".format(*cell)

def point_GridGroups(inFile,X_Min,Y_Max,xDist,yDist):
    """Bucket every point read from ``inFile`` by its grid-cell id.

    Each point's ``x``/``y`` attributes are passed to ``world2Pixel_Id``
    and the point is appended to the list stored under the returned id.

    Returns a ``defaultdict(list)`` mapping cell id to the points of that
    cell, in the order they were read.
    """
    cells = defaultdict(list)
    reader = lasfile.File(inFile, None, 'r')
    for point in reader:
        cell_id = world2Pixel_Id(point.x, point.y, X_Min, Y_Max, xDist, yDist)
        cells[cell_id].append(point)
    return cells

ここで、k は選択する要素の数で、Groups は辞書です。

file_out = lasfile.File("outPut",mode='w',header= h)
try:
    # Only the grouped point lists are needed, so iterate the values.
    for points in Groups.itervalues():
        # Select up to k points for this dictionary key and save them.
        for point in point_random_selection(points, k):
            file_out.write(point)
finally:
    # Close the output even if sampling or writing fails part-way.
    file_out.close()

私の問題は、このアプローチが非常に遅いことです（〜800 Mbのファイルの場合、約4日）

score 1 · Accepted Answer

座標を読みながら、サンプルを更新してみることができます。これにより、少なくとも、サンプルを実行する前にすべてをメモリに保存する必要がなくなります。これは高速化を保証するものではありません。

以下は、すべての行を保持せずにファイル入力からランダムなサンプルを作成する BlkKnght の優れた回答に基づいています。ここでは、それを拡張して複数のサンプルを保持するようにしています。

import random
from liblas import file as lasfile
from collections import defaultdict


def world2Pixel_Id(x, y, X_Min, Y_Max, xDist, yDist):
    """Return the ``(col, row)`` grid-cell key for a coordinate pair.

    ``col`` is the x offset from ``X_Min`` divided by ``xDist``; ``row`` is
    the y offset below ``Y_Max`` divided by ``yDist``. Both are truncated
    with ``int()``.
    """
    dx = x - X_Min
    dy = Y_Max - y
    return int(dx / xDist), int(dy / yDist)

def random_grouped_samples(infile, n, X_Min, Y_Max, xDist, yDist):
    """Select up to n points *per group* from infile.

    Points are read one at a time and grouped by the key returned from
    ``world2Pixel_Id``. For every group a sample of at most ``n`` points is
    maintained while reading, so only the samples (not every point) are
    kept by this function.

    Returns a ``defaultdict(list)`` mapping group key to its sampled points.
    """
    seen = defaultdict(int)  # number of points encountered so far, per group
    samples = defaultdict(list)

    # Read from the ``infile`` parameter; the body previously referenced the
    # undefined name ``inFile``.
    for p in lasfile.File(infile, None, 'r'):
        key = world2Pixel_Id(p.x, p.y, X_Min, Y_Max, xDist, yDist)
        count = seen[key]
        r = random.randint(0, count)  # inclusive on both ends

        if r < n:
            if count < n:
                samples[key].insert(r, p)  # add first n items in random order
            else:
                samples[key][r] = p  # at a decreasing rate, replace random items

        seen[key] = count + 1

    return samples

上記の関数は、入力ファイル、境界座標、およびサンプルサイズ n を受け取り、各グループから均一に選択された最大 n 個のアイテムを含む、グループ化されたサンプルを返します。

id はグループキーとして使用するだけなので、(col, row) のタプルを計算するだけに簡略化しました。文字列にする必要はありません。

これらを次のようにファイルに書き出すことができます。

file_out = lasfile.File("outPut",mode='w',header= h)

try:
    # Write every sampled point of every group to the output file.
    for group in samples.itervalues():
        for p in group:
            file_out.write(p)
finally:
    # Close the output even if a write fails part-way.
    file_out.close()

python - random.sample のパフォーマンスを向上させる方法

1 に答える 1

Related

Reference