c++ - 文字列のパターンを見つけるより良い解決策は?

Question

文字列のパターンを見つけて比較する最適な方法を見つけようとしています。たとえば、s1 = "赤青青赤赤黄"、および s2 = " abbaac " があります。同じパターンなので一致します。

これを行う私の考えは、s1 と s2 を反復処理し、ベクトルコンテナーを使用して対応する場所のカウントを記録し (s1 は対応する単語のカウントになり、s2 は対応する文字のカウントになるため)、比較します。

これは、s1 と s2 全体を反復処理するため、非常に非効率的です。s1 = " red blue red red red red yellow " および s2 = " abbaac " の場合。3 番目のredの後、それを反復し続ける意味は本質的にありません。

それで、これを行う方法について何か良いアイデアはありますか？

コード：

#include "stdafx.h"
#include <iostream>
#include <string>
#include <array>
#include <sstream>
#include <vector>
#include <algorithm>
using namespace std;

vector<int> findPattern(string pattern){
    vector<int> counts;
    for (int i = 0; i < pattern.size(); ++i){
        counts.push_back(0);
        int counter = 0;
        for (int j = i + 1; j < pattern.size(); ++j){
            if (pattern[i] == pattern[j]){
                ++counter;              
            }   
            counts[i] = counter;    
        }
    }
    return counts;
}

vector<int> findPatternLong(string pattern){
    istringstream iss (pattern);
    string word;
    vector<string> v;
    while (iss >> word){
        v.push_back(word);
    }
    vector<int> counts2;
    for (int i = 0; i < v.size(); ++i){
        counts2.push_back(0);
        int counter = 0;
        for (int j = i + 1; j < v.size(); ++j){
            if (v[i] == v[j]){
                ++counter;
            }
            counts2[i] = counter;
        }
    }
    return counts2;
}

int main(int argc, char * argv[]){
    vector<int> v1 = findPattern("abbaac");
    vector<int> v2 = findPatternLong("red blue blue red red yellow");
    if (v1.size() == v2.size()){
        for (int i = 0; i < v1.size(); ++i){
            if (v1[i] != v2[i]){
                cout << "Unmatch" << endl;
                return false;
            }
        }
        cout << "match" << endl;
        return true;
    } else 
        cout << "Unmatch" << endl; 
    return 0;
}

score 2 · Accepted Answer

@トニーは同じ考えで私を打ち負かしましたが、私はすでにこれを入力したので、ここに行きます:-)

まず第一に、効率性についてあまり心配せず、正確さに集中してください。実際、時期尚早の最適化は諸悪の根源です。テストケースを作成し、コードがそれぞれに合格することを確認します。

次に、マップ/辞書 D から始めて、各文字列の 1 つの要素を解析するループがあると思います (s1 の単語を「w」と呼び、s2 の文字を「 c")、キーとして 1 つの要素 ("c" 文字など) を選択し、"c" が辞書に既にエントリがあるかどうかを確認します。

同時に要素を使い果たした場合、文字列は一致します
一方の要素が不足している場合、一致するものがないことがわかります
"c" が D にエントリを持たない場合は、現在の値を保存します: D[c] = w;
それ以外の場合、「c」に既にエントリがある場合は、エントリが文字列で見つかった値と一致するかどうかを確認します: is D[c] == w? そうでない場合は、一致しないことがわかります

そのコードが機能する場合、最適化を開始できます。あなたの例では、ASCII 文字は小さな有限セットであるため、辞書の代わりに単純な配列を使用できます。

score 1 · Accepted Answer

これは最も効率的なコードではありませんが、最も単純に近いコードです。

std::map<char, std::string> letter_to_word;
std::set<std::string> words_seen;
std::istringstream iss(s1);
std::string word;
for (std::string::size_t i = 0; i < s2.size(); ++i)
{
    if (!(iss >> word))
        return false; // more letters than words
    std::string& expected_word = letter_to_word[s2[i]];
    if (expected_word == "")
    {
        // if different letters require different words...
        if (words_seen.find(word) != words_seen.end())
            return false; // multiple letters for same word
        words_seen.insert(word);

        expected_word = word; // first time we've seen letter, remember associated word
    }
    else if (expected_word != word)
        return false; // different word for same letter
}
return !(iss >> word); // check no surplus words

score 0 · Accepted Answer

編集

質問には文字列のパターンがあり、この回答はさまざまなタイプのコレクションの比較に関係していることを読みました。2つの入力文字列が最初に変換された場合、答えにはまだ少し水が残っていると思います:)

これが最も効率的な解決策とは言えませんが、拡張性が高いのは気に入っています。

まず、PatternResultクラスがあります。パターンの結果を保存します。

class PatternResult {
private:
    std::vector<int> result_;

public:
    PatternResult(const std::vector<int>& result) : result_(result) {
    };

    bool operator == (const PatternResult& rhs) const {
        if(result_.size() != rhs.result_.size()) 
            return false;
        else {
            for(std::vector<int>::size_type r(0);
                r < result_.size();
                ++r) {
                if(result_[r] != rhs.result_[r])
                    return false;
            };
            return true;
        };
    };
};  // eo class PatternResult

それは整数のベクトルを取り、その値はその値を示します。==2つのパターン結果を比較するためにオーバーロードします。つまり、ソースデータに関係なく同じシーケンスになります。

次に、同じシーケンス番号を割り当てることができるが、任意のタイプをとることができるパターンカウンターが必要です。

template<class T>
class PatternCounter {
private:
    typedef std::vector<T> vec_type;
    typedef std::map<T, int> map_type;
    map_type found_;
    int counter_;
public:
    PatternCounter() : counter_(1) {
    };

    PatternResult count(const vec_type& input ){
        std::vector<int> ret;
        for(vec_type::const_iterator cit(input.begin());
            cit != input.end();
            ++cit) {
            if(found_.find(*cit) != found_.end()) {
                ret.push_back(found_[*cit]);
            } else {
                found_[*cit] = counter_;
                ret.push_back(counter_);
                ++counter_;
            };
        };
        return PatternResult(ret);
    };
};

これで完了です。テストコード：

std::vector<std::string> inp1;
inp1.push_back("red");
inp1.push_back("blue");
inp1.push_back("blue");
inp1.push_back("red");
inp1.push_back("yellow");

std::vector<char> inp2;
inp2.push_back('a');
inp2.push_back('b');
inp2.push_back('b');
inp2.push_back('a');
inp2.push_back('c');

PatternCounter<std::string> counter1;
PatternCounter<char> counter2;

PatternResult res1(counter1.count(inp1));
PatternResult res2(counter2.count(inp2));

if(res1 == res2) {
        // pattern sequences are equal
};

これは迅速で汚いことに注意してください。もっと効率的にすることができると確信しています。

score 0 · Accepted Answer

基本的に、シーケンスが同じ順序に従っていることを確認する必要があります。シーケンスが実際にどのようなものであるかは気にしません。最初の 2 番目の最初の 3 分の 1 で十分です。これは、何らかの方法で文字列を int にマップするコンテナーで行うことができます。ただし、各文字列のコピーを保存することになり、文字列値をあまり気にしないという事実を無視しています。小さなテストケースの場合、これは問題になりませんが、長い単語が大量に続く場合は、必要のないときにすぐにメモリを使い果たしてしまいます。

それでは、文字列値やそれらの保存について気にしないという事実を利用しましょう。その場合、ハッシュ関数を使用して文字列を単純な size_t 値に変換し、文字列が一意であることをかなり強力に保証できます。ただし、ハッシュはシーケンシャルではないため、ハッシュ値に基づいてシーケンスを取得する必要があります。それらのシーケンスを記録する最も簡単な方法は、簡単に検索できるようにマップのサイズにマップすることです。パズルの最後のピースは、ハッシュが同じ順序であることを確認することです。

また、文を単語と比較するだけでなく、2 つの単語または 2 つの文を比較したい場合もあると思います。基本的に上記を実行し、必要でない限りメモリに何も保持しない C++11 の簡単なサンプルを次に示します。

もちろん、これはさらに最適化することができます-たとえば、物事を並行して実行します。

#include <iostream>
#include <vector>
#include <string>
#include <map>
#include <sstream>
/*
s1 = "red blue blue red red yellow"
s2 = "abbaac"
This would match because they have the same pattern.
*/
typedef std::map<size_t,size_t> hash_map;
typedef std::vector<std::string> wordlist;

size_t ordered_symbol( hash_map &h, std::string const& word )
{
    std::hash<std::string> hash_fn;
    size_t hash = hash_fn(word);
    if(h.find(hash)==h.end())
    {
        size_t const sequence = h.size();
        h[hash] = sequence;
        return sequence;
    }
    return h[hash];
}

wordlist create_wordlist( std::string const& str )
{
    if(str.find_first_of(' ') != std::string::npos)
    {
        wordlist w1;
        std::stringstream sstr(str);
        std::string s;
        while(sstr>>s)
            w1.push_back(s);
        return w1;        
    }
    wordlist w2;
    for(auto i : str)
    {
        std::string s;
        s.append(1,i);
        w2.push_back(s);
    }
    return w2;
}

bool pattern_matches( std::string const& s1, std::string const& s2 )
{
    wordlist const w1 = create_wordlist(s1);
    wordlist const w2 = create_wordlist(s2);
    if(w1.size()!=w2.size())
        return false;
    hash_map h1,h2;
    for( size_t i = 0; i!=w1.size(); ++i)
        if(ordered_symbol(h1,w1[i])!=ordered_symbol(h2,w2[i]))
            return false;
    return true;
}

void test( std::string const& s1, std::string const& s2 )
{
    std::cout<<"["<<s1<<"] "
             <<(pattern_matches(s1,s2)? "<==>" : "<=!=>")
             <<"["<<s2<<"]\n";    
}

int main()
{
    test("red blue blue red red yellow","abbaac");
    test("red blue blue red red yellow","first second second first first third");
    test("abbaac","12211g");
    test("abbaac","red blue blue red red yellow");
    test("abbgac","red blue blue red red yellow");
    return 0;
}

//Output:
//[red blue blue red red yellow] <==>[abbaac]
//[red blue blue red red yellow] <==>[first second second first first third]
//[abbaac] <==>[12211g]
//[abbaac] <==>[red blue blue red red yellow]
//[abbgac] <=!=>[red blue blue red red yellow]

編集:これは、VS2010 で動作するC++11 以外のバージョンです。ただし、C++03 には標準ライブラリに文字列ハッシュ関数が含まれていないため、この例ではスタックオーバーフローから取得したハッシュ関数を使用しています。ブーストライブラリにアクセスできる場合、使用するより優れたハッシュ関数はこれです。

score 0 · Accepted Answer

2 つのベクトルは必要ありません。

2 番目の文字列を処理するときは、最初のパターンのカウントを最初のエントリと比較します。一致する場合は続行し、そうでない場合は停止します。2 番目の文字列の残りのパターンについて繰り返します。

2 番目の文字列のパターンカウントを保存する必要はありません。

c++ - 文字列のパターンを見つけるより良い解決策は?

5 に答える 5

Related

Reference