c++ - boost::spirit::lex トークンの認識方法

Question

私は boost::spirit の使い方を学んでいます。そのために、まず単純なレクサーをいくつか作成して組み合わせ、それから spirit を使って解析を始めようと考えました。しかし、結果にかなり困惑しています:

レクサーは次のとおりです。

// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>


#include <iostream>
#include <string>

using namespace boost::spirit;
using namespace boost::spirit::ascii;


// Token ids shared by the lexer definitions in sip_token and the grammar.
// The first id is offset from lex::min_token_id; the others follow sequentially.
enum tokenids
{
  IDANY = lex::min_token_id + 10,  // bound to the catch-all "." definition
  T_USER,                          // bound to the "{USER}" definition
  T_DOMAINLABEL,                   // bound to the "{DOMAINLABEL}" definition
  T_CRLF                           // bound to the CR LF definition
};


// Lexer definition: named regex patterns plus the token definitions built from them.
// Lexer is the underlying lexer implementation, forwarded to lex::lexer.
template <typename Lexer>
struct sip_token : lex::lexer<Lexer>
{
  // Registers the named patterns first, then the token definitions.
  sip_token()
  {
    // Named sub-patterns; a pattern refers to an earlier one as {NAME}.
    this->self.add_pattern
      ("ALPHANUM", "[0-9a-zA-Z]")
      ("MARK", "[-_.!~*'()]")           
      ("UNRESERVED","{ALPHANUM}|{MARK}")            
      ("USER", "({UNRESERVED})+" ) 
      ("DOMAINLABEL", "({ALPHANUM})+")
      // ("DOMAINLABEL", "{ALPHANUM}|({ALPHANUM}({ALPHANUM}|-)*{ALPHANUM})") 
      ;     

    // Token definitions, added in this order, each bound to a tokenids value.
    // NOTE(review): USER accepts ALPHANUM or MARK characters, so any text that
    // fits DOMAINLABEL (ALPHANUM only) also fits USER, which is added first.
    this->self.add
      ("{USER}",T_USER)
      ("{DOMAINLABEL}", T_DOMAINLABEL)          
      ("\r\n", T_CRLF)
      (".", IDANY)    // string literals will not be escaped by the library
      ;
  } 
};


// Token-level grammar: one T_DOMAINLABEL token followed by one T_CRLF token.
// Iterator is the token iterator type the grammar is instantiated with.
// The public counters c, w and l start at 0 and are incremented by the
// semantic actions attached to the rule.
template <typename Iterator>
struct sip_grammar : qi::grammar<Iterator>
{
  // tok is the token definition object. The rule matches by token id only,
  // so tok is not read here; the parameter is kept so callers stay unchanged.
  template <typename TokenDef>
  sip_grammar(TokenDef const& tok)
    : sip_grammar::base_type(start)
    , c(0), w(0), l(0)
  {
    using boost::phoenix::ref;

    // The T_DOMAINLABEL action increments c and l; the T_CRLF action increments w.
    start =
         qi::token(T_DOMAINLABEL) [++ref(c), ++ref(l)]
      >> qi::token(T_CRLF)        [++ref(w)]
      ;
  }

  std::size_t c, w, l;
  qi::rule<Iterator> start;
};



int main(int argc, char* argv[])
{
  typedef lex::lexertl::token<
  char const*, boost::mpl::vector<std::string>
  > token_type;

  typedef std::string::const_iterator str_iterator_type;
  typedef lex::lexertl::lexer<token_type> lexer_type;
  typedef sip_token<lexer_type>::iterator_type iterator_type;

  std::string str;
  while (std::getline(std::cin, str))
  {
    if (str.empty() || str[0] == 'q' || str[0] == 'Q')
      break;        
    else
      str += "\r\n";

    sip_token<lexer_type> siplexer;
    sip_grammar<iterator_type > g(siplexer);

    char const* first = str.c_str();
    char const* last = &first[str.size()];

    /*<  Parsing is done based on the the token stream, not the character
    stream read from the input. The function `tokenize_and_parse()` wraps
    the passed iterator range `[first, last)` by the lexical analyzer and
    uses its exposed iterators to parse the toke stream.
    >*/  
    unsigned result = 0;
    bool r = lex::tokenize_and_parse(first, last, siplexer, g);     

    if (r) {
      std::cout << "Parsing OK" << g.l << ", " << g.w
        << ", " << g.c << "\n";
    }
    else {
      std::string rest(first, last);
      std::cerr << "Parsing failed\n" << "stopped at: \""
        << rest << "\"\n";
    }

  }
  return 0;
}
//]

コードでは、「T_USER」の後に「T_DOMAINLABEL」を追加しています。ところが、T_DOMAINLABEL の解析は常に失敗します。レクサーは先に T_USER にマッチさせているようです。なぜでしょうか？このような似たパターンは一緒に追加できないということでしょうか？

score 2 · Accepted Answer

さて、T_USER は次のパターンにマッチします:

  ("{USER}",T_USER)

  // which is defined as
  ("USER", "({UNRESERVED})+" ) 

  // which is defined as
  ("UNRESERVED","{ALPHANUM}|{MARK}")

したがって、T_USER は任意の英数字の並びにマッチします (ここでは関係ありませんが、「MARK」に含まれる文字も同様です)。

T_DOMAINLABEL は以下に一致します:

  ("{DOMAINLABEL}", T_DOMAINLABEL)          

  // which is defined as
  ("DOMAINLABEL", "({ALPHANUM})+")

ご覧のとおり、T_DOMAINLABEL トークンは常に有効な T_USER トークンです。したがって、T_DOMAINLABEL を取得する方法はありません。

これは「トークンがマッチしない」からではなく、トークン化が貪欲 (eager) に行われ、単一トークンの範囲を超えたバックトラッキングを行わないことの結果です。

c++ - boost::spirit::lex トークンの認識方法

回答 1 件

Related

Reference