c++ - C ++ソートされたベクトルをソートされたベクトルにマージする/それらすべてから最小の要素をポップする方法は？

Question

私は約100個ほどのソートされたコレクションを持ってvector<int>いますほとんどのベクトルには少数の整数が含まれていますが、一部のベクトルにはそれらの大きな（> 10K）が含まれています（したがって、ベクトルは必ずしも同じサイズである必要はありません））。

私がやりたいことは、基本的に、これらすべてのソートされたベクトルに含まれている最小の整数から最大の整数まで反復します。

これを行う1つの方法は、これらすべてのソートされたベクトルをソートされたベクトルにマージし、単純に反復することです。したがって、

質問1：ソートされたベクトルをソートされたベクトルにマージする最速の方法は何ですか？

一方、全体をマージして再ソートすることなく、これを実現するためのより高速で賢い方法があると確信しています。おそらく、このソートされたベクトルのコレクションから最小の整数を繰り返しポップします。それらを最初にマージせずに..そう：

質問2：ソートされたものの束から最小の要素をポップするための断食/最良の方法は何vector<int>ですか？

以下の回答と質問へのコメントに基づいて、ソートされたベクトルのイテレーターの優先キューを作成するアプローチを実装しました。これがパフォーマンス効率に優れているかどうかはわかりませんが、メモリ効率は非常に高いようです。まだ最速の方法を確立したかどうかわからないので、質問はまだ開いていると思います。

// compare vector pointers by integers pointed
struct cmp_seeds {
    bool operator () (const pair< vector<int>::iterator, vector<int>::iterator> p1, const pair< vector<int>::iterator, vector<int>::iterator> p2) const {
        return *(p1.first) >  *(p2.first);      
    }
};

int pq_heapsort_trial() {

    /* Set up the Sorted Vectors */ 
    int a1[] = { 2, 10, 100};
    int a2[] = { 5, 15, 90, 200};
    int a3[] = { 12 };

    vector<int> v1 (a1, a1 + sizeof(a1) / sizeof(int));
    vector<int> v2 (a2, a2 + sizeof(a2) / sizeof(int));
    vector<int> v3 (a3, a3 + sizeof(a3) / sizeof(int));

    vector< vector <int> * > sorted_vectors;
    sorted_vectors.push_back(&v1);
    sorted_vectors.push_back(&v2);
    sorted_vectors.push_back(&v3);
    /* the above simulates the "for" i have in my own code that gives me sorted vectors */

    pair< vector<int>::iterator, vector<int>::iterator> c_lead;
    cmp_seeds mycompare;

    priority_queue< pair< vector<int>::iterator, vector<int>::iterator>, vector<pair< vector<int>::iterator, vector<int>::iterator> >, cmp_seeds> cluster_feeder(mycompare);


    for (vector<vector <int> *>::iterator k = sorted_vectors.begin(); k != sorted_vectors.end(); ++k) {
        cluster_feeder.push( make_pair( (*k)->begin(), (*k)->end() ));
    }


    while ( cluster_feeder.empty() != true) {
        c_lead = cluster_feeder.top();
        cluster_feeder.pop();
        // sorted output
        cout << *(c_lead.first) << endl;

        c_lead.first++;
        if (c_lead.first != c_lead.second) {
            cluster_feeder.push(c_lead);
        }
    }

    return 0;
}

score 4 · Accepted Answer

1つのオプションは、を使用しstd :: priority queueてイテレーターのヒープを維持することです。イテレーターは、ポイントする値に応じてヒープをバブルアップします。

の繰り返しアプリケーションの使用を検討することもできますstd :: inplace_merge。これには、すべてのデータを一緒に大きなベクトルに追加し、ソートされた各ブロックが開始および終了するオフセットを記憶してから、それらをinplace_mergeに渡すことが含まれます。基本的に複雑さは同等だと思いますが、これはおそらくヒープソリューションよりも高速です。

更新：今説明した2番目のアルゴリズムを実装しました。その場でマージソートを繰り返し実行します。このコードはideoneにあります。

これは、最初にすべてのソートされたリストを1つの長いリストに連結することによって機能します。ソースリストが3つある場合、これは4つの「オフセット」があることを意味します。これは、要素が並べ替えられる完全なリストの4つのポイントです。次に、アルゴリズムはこれらのうち3つを一度にプルオフし、対応する2つの隣接するソート済みリストを1つのソート済みリストにマージし、new_offsetsで使用される3つのオフセットのうち2つを記憶します。

これはループで繰り返され、ソートされた範囲が1つだけ残るまで、隣接するソートされた範囲のペアがマージされます。

最終的に、最良のアルゴリズムは、隣接する範囲の最短のペアを最初にマージすることを含むと思います。

// http://stackoverflow.com/questions/9013485/c-how-to-merge-sorted-vectors-into-a-sorted-vector-pop-the-least-element-fro/9048857#9048857
#include <iostream>
#include <vector>
#include <algorithm>
#include <cassert>
using namespace std;

template<typename T, size_t N>
vector<T> array_to_vector( T(*array)[N] ) { // Yes, this works. By passing in the *address* of
                                            // the array, all the type information, including the
                                            // length of the array, is known at compiler. 
        vector<T> v( *array, &((*array)[N]));
        return v;
}   

void merge_sort_many_vectors() {

    /* Set up the Sorted Vectors */ 
    int a1[] = { 2, 10, 100};
    int a2[] = { 5, 15, 90, 200};
    int a3[] = { 12 };

    vector<int> v1  = array_to_vector(&a1);
    vector<int> v2  = array_to_vector(&a2);
    vector<int> v3  = array_to_vector(&a3);


    vector<int> full_vector;
    vector<size_t> offsets;
    offsets.push_back(0);

    full_vector.insert(full_vector.end(), v1.begin(), v1.end());
    offsets.push_back(full_vector.size());
    full_vector.insert(full_vector.end(), v2.begin(), v2.end());
    offsets.push_back(full_vector.size());
    full_vector.insert(full_vector.end(), v3.begin(), v3.end());
    offsets.push_back(full_vector.size());

    assert(full_vector.size() == v1.size() + v2.size() + v3.size());

    cout << "before:\t";
    for(vector<int>::const_iterator v = full_vector.begin(); v != full_vector.end(); ++v) {
            cout << ", " << *v;
    }       
    cout << endl;
    while(offsets.size()>2) {
            assert(offsets.back() == full_vector.size());
            assert(offsets.front() == 0);
            vector<size_t> new_offsets;
            size_t x = 0;
            while(x+2 < offsets.size()) {
                    // mergesort (offsets[x],offsets[x+1]) and (offsets[x+1],offsets[x+2])
                    inplace_merge(&full_vector.at(offsets.at(x))
                                 ,&full_vector.at(offsets.at(x+1))
                                 ,&(full_vector[offsets.at(x+2)]) // this *might* be at the end
                                 );
                    // now they are sorted, we just put offsets[x] and offsets[x+2] into the new offsets.
                    // offsets[x+1] is not relevant any more
                    new_offsets.push_back(offsets.at(x));
                    new_offsets.push_back(offsets.at(x+2));
                    x += 2;
            }
            // if the number of offsets was odd, there might be a dangling offset
            // which we must remember to include in the new_offsets
            if(x+2==offsets.size()) {
                    new_offsets.push_back(offsets.at(x+1));
            }
            // assert(new_offsets.front() == 0);
            assert(new_offsets.back() == full_vector.size());
            offsets.swap(new_offsets);

    }
    cout << "after: \t";
    for(vector<int>::const_iterator v = full_vector.begin(); v != full_vector.end(); ++v) {
            cout << ", " << *v;
    }
    cout << endl;
}

int main() {
        merge_sort_many_vectors();
}

score 2 · Accepted Answer

最初に頭に浮かぶのは、各ベクトルへのイテレータを含むヒープ構造を、それらが現在指している値の順に並べて作成することです。（もちろん、各エントリにはエンドイテレータも含まれている必要があります）

現在の要素はヒープのルートにあり、続行するには、要素をポップするか、キーを増やすだけです。（後者は、ポップ、インクリメント、プッシュすることで実行できます）

これは漸近的な複雑さを持つべきだと思いますO(E log M)。ここEで、は要素の総数であり、Mはベクトルの数です。

ベクトルからすべてを実際にポップしている場合は、ベクトルへのポインタのヒープを作成できます。ベクトルの先頭から消去することによるパフォーマンスの低下を回避するために、それらもヒープとして処理することをお勧めします。deque（または、最初にすべてをsにコピーできます）

一度にペアをマージすることによってそれらをすべて一緒にマージすることは、順序に注意する場合、同じ漸近的な複雑さを持ちます。すべてのベクトルを完全でバランスの取れた二分木に配置し、ツリーを上るときにペアごとにマージすると、各要素が何度もコピーされ、アルゴリズムlog Mにつながります。O(E log M)

実際の効率を高めるために、ツリーの代わりに、残りが1つになるまで、最小の2つのベクトルを繰り返しマージする必要があります。（繰り返しになりますが、ベクトルへのポインターをヒープに配置するのが方法ですが、今回は長さ順に並べられています）

（実際には、長さではなく「コピーコスト」で注文する必要があります。特定の値タイプ用に最適化するための追加事項）

推測する必要がある場合、最も速い方法は2番目のアイデアを使用することですが、ペアワイズマージの代わりにN-aryマージを使用して、適切なN（小さな定数、または大まかにベクトル数の平方根）、上記の最初のアルゴリズムを使用してN-aryマージを実行し、N個のベクトルの内容を一度に列挙します。

score 0 · Accepted Answer

ここで示したアルゴリズムを使用して、少し抽象化しました。テンプレートへの変換。このバージョンをVS2010でコーディングし、ファンクターの代わりにラムダ関数を使用しました。これが以前のバージョンよりも「優れている」かどうかはわかりませんが、誰かが役立つかもしれません。

#include <queue>
#include <vector>

namespace priority_queue_sort
{
    using std::priority_queue;
    using std::pair;
    using std::make_pair;
    using std::vector;

    template<typename T>
    void value_vectors(const vector< vector <T> * >& input_sorted_vectors, vector<T> &output_vector)
    {
        typedef vector<T>::iterator iter;
        typedef pair<iter, iter>    iter_pair;

        static auto greater_than_lambda = [](const iter_pair& p1, const iter_pair& p2) -> bool { return *(p1.first) >  *(p2.first); };

        priority_queue<iter_pair, std::vector<iter_pair>, decltype(greater_than_lambda) > cluster_feeder(greater_than_lambda);

        size_t total_size(0);

        for (auto k = input_sorted_vectors.begin(); k != input_sorted_vectors.end(); ++k)
        {
            cluster_feeder.push( make_pair( (*k)->begin(), (*k)->end() ) );
            total_size += (*k)->size();
        }

        output_vector.resize(total_size);
        total_size = 0;
        iter_pair c_lead;
        while (cluster_feeder.empty() != true)
        {
            c_lead = cluster_feeder.top();
            cluster_feeder.pop();
            output_vector[total_size++] = *(c_lead.first);
            c_lead.first++;
            if (c_lead.first != c_lead.second) cluster_feeder.push(c_lead);
        }
    }

    template<typename U, typename V>
    void pair_vectors(const vector< vector < pair<U, V> > * >& input_sorted_vectors, vector< pair<U, V> > &output_vector)
    {
        typedef vector< pair<U, V> >::iterator iter;
        typedef pair<iter, iter> iter_pair;

        static auto greater_than_lambda = [](const iter_pair& p1, const iter_pair& p2) -> bool { return *(p1.first) >  *(p2.first); };

        priority_queue<iter_pair, std::vector<iter_pair>, decltype(greater_than_lambda) > cluster_feeder(greater_than_lambda);

        size_t total_size(0);

        for (auto k = input_sorted_vectors.begin(); k != input_sorted_vectors.end(); ++k)
        {
            cluster_feeder.push( make_pair( (*k)->begin(), (*k)->end() ) );
            total_size += (*k)->size();
        }

        output_vector.resize(total_size);
        total_size = 0;
        iter_pair c_lead;

        while (cluster_feeder.empty() != true)
        {
            c_lead = cluster_feeder.top();
            cluster_feeder.pop();
            output_vector[total_size++] = *(c_lead.first);  
            c_lead.first++;
            if (c_lead.first != c_lead.second) cluster_feeder.push(c_lead);
        }
    }
}

アルゴリズムpriority_queue_sort::value_vectorsは、値のみを含むベクトルをソートします。一方priority_queue_sort::pair_vectors、最初のデータ要素に従って、データのペアを含むベクトルを並べ替えます。誰かがいつかこれを使用できることを願っています:-)

c++ - C ++ソートされたベクトルをソートされたベクトルにマージする/それらすべてから最小の要素をポップする方法は？

3 に答える 3

Related

Reference