c++ - 引用符付き文字列を 1 つのトークンとして扱うブーストトークナイザー

Question

引用符で囲まれた部分を分割せずに、Boost.Tokenizer で以下の文字列をトークンに分割する方法はありますか?

string s = "1st 2nd \"3rd with some comment\" 4th";

Expected output:
1st
2nd
3rd with some comment
4th

score 3 · Accepted Answer

You can use an escaped_list_separator from the tokenizer library. See this question for more details on how to apply it to your problem.

score 2 · Accepted Answer

C++11 solution

#include <iostream>
#include <string>
#include <vector>

// Splits `str` into tokens separated by spaces, treating text between a pair
// of double quotes as part of a single token (spaces inside quotes are kept).
//
// - The quote characters themselves are never copied into a token.
// - A closing quote always ends the current token, even if it is empty.
// - An opening quote does not end the current token, so text immediately
//   before it is joined with the quoted text.
// - Runs of spaces outside quotes never produce empty tokens.
// - Any text still pending at the end of input (including an unterminated
//   quoted section) is emitted as the last token if it is non-empty.
std::vector<std::string> tokenize(const std::string& str) {
    std::vector<std::string> result;
    std::string current;
    bool quoted = false;

    for (const char ch : str) {
        if (ch == '"') {
            // Leaving a quoted section finishes the token unconditionally.
            if (quoted) {
                result.push_back(current);
                current.clear();
            }
            quoted = !quoted;
            continue;
        }

        // Everything except an unquoted space belongs to the current token.
        if (quoted || ch != ' ') {
            current.push_back(ch);
            continue;
        }

        // Unquoted space: finish the pending token, if there is one.
        if (!current.empty()) {
            result.push_back(current);
            current.clear();
        }
    }

    // Flush whatever is left after the final character.
    if (!current.empty()) {
        result.push_back(current);
    }

    return result;
}

// Demo driver: tokenizes the sample line and prints one token per line.
int main() {
    const std::string s = "1st 2nd \"3rd with some comment\" 4th";
    const std::vector<std::string> tokens = tokenize(s);
    for (const std::string& token : tokens) {
        std::cout << token << "\n";
    }
}

score 1 · Accepted Answer

次のコードを使えば、Boost.Tokenizer や Boost.Spirit ライブラリを使わずに済みます。

#include <vector>
#include <string>
#include <iostream>

// Characters that delimit tokens: a space and a horizontal tab (ASCII 9).
const char Separators[] = { ' ', '\t' };

// Returns true when Ch equals one of the characters listed in Separators.
bool Str_IsSeparator( const char Ch )
{
    for ( const char Candidate : Separators )
    {
        if ( Candidate == Ch ) { return true; }
    }

    return false;
}

// Splits Str into tokens and stores the tokens numbered FromToken..ToToken
// (1-based, inclusive) into Components.
//
// Tokens are separated by the characters accepted by Str_IsSeparator. A token
// that starts with a double quote extends to the next double quote (or to the
// end of the string if the quote is never closed); the quotes themselves are
// not stored. Empty tokens, including an empty quoted string "", are skipped
// and do not count towards the token number.
//
// Storage is NOT zero-based: token number T is written to
// Components[ T - (FromToken - 1) ], so with FromToken == 1 the first token
// lands in Components[1] and Components[0] is left untouched.
//
// Components is grown on demand if the target index does not exist yet, so a
// too-small vector no longer causes an out-of-bounds write. Existing elements
// that are not overwritten keep their values.
//
// Parsing stops as soon as a token with number >= ToToken has been stored, or
// when the end of Str is reached.
void SplitLine( size_t FromToken, size_t ToToken, const std::string& Str, std::vector<std::string>& Components /*, bool ShouldTrimSpaces*/ )
{
    size_t TokenNum = 0;
    size_t Offset   = FromToken - 1;

    const char* CStr = Str.c_str();

    while ( *CStr )
    {
        // bypass spaces & delimiting chars
        while ( *CStr && Str_IsSeparator( *CStr ) ) { CStr++; }

        if ( !*CStr ) { return; }

        const bool InsideQuotes = ( *CStr == '\"' );

        // step over the opening quote so it is not part of the token
        if ( InsideQuotes ) { CStr++; }

        // find the end of the token: closing quote, separator or end of string
        const char* CStrj = CStr;

        if ( InsideQuotes )
        {
            while ( *CStrj && *CStrj != '\"' ) { CStrj++; }
        }
        else
        {
            while ( *CStrj && !Str_IsSeparator( *CStrj ) ) { CStrj++; }
        }

        // extract token
        if ( CStr != CStrj )
        {
            TokenNum++;

            // store each token found
            if ( TokenNum >= FromToken )
            {
                const size_t Index = TokenNum - Offset;

                // grow the output instead of writing past its end
                if ( Index >= Components.size() ) { Components.resize( Index + 1 ); }

                Components[ Index ].assign( CStr, CStrj );
                // if ( ShouldTrimSpaces ) { Str_TrimSpaces( &Components[ Index ] ); }
                // proceed to next token
                if ( TokenNum >= ToToken ) { return; }
            }
        }

        // Always move past the token and its terminator (closing quote or
        // separator), even when the token was empty. Otherwise the closing
        // quote of "" would be re-read as a new opening quote.
        CStr = CStrj;

        // exclude last " from token, handle EOL
        if ( *CStr ) { CStr++; }
    }
}

// Demo driver: splits the sample line with SplitLine and prints every slot of
// the output vector, one per line.
int main()
{
    const std::string test = "1st 2nd \"3rd with some comment\" 4th";

    // Five slots: SplitLine stores tokens 1..4 at indices 1..4, so index 0
    // stays empty and is printed as a blank line.
    std::vector<std::string> Out( 5 );
    SplitLine( 1, 4, test, Out );

    for ( const std::string& Entry : Out ) { std::cout << Entry << std::endl; }

    return 0;
}

事前に割り当てられた文字列配列を使用し (ゼロベースではありませんが、簡単に修正できます)、非常に単純です。

c++ - 引用符付き文字列を 1 つのトークンとして扱うブースト トークナイザー

3 件の回答

Related

Reference

c++ - 引用符付き文字列を 1 つのトークンとして扱うブーストトークナイザー