boost - Spirit.Lex を使用して独特の単項マイナス記号を解析する

Question

記号の前後にある空白の有無によって、単項マイナスと二項 (バイナリ) マイナスが区別される言語を解析しようとしています。以下は、この言語でマイナス記号がどのように解釈されるかを示す疑似ルールです。

 -x       // unary
 x - y    // binary
 x-y      // binary
 x -y     // unary
 x- y     // binary
 (- y ... // unary

注:最後の規則の開きかっこは、「identifier」、「number」、および「close_paren」を除く言語の任意のトークンに置き換えることができます。

注: 4 番目のケースでは、x は識別子です。識別子はそれ単独で 1 つのステートメントになり得るため、-y は別のステートメントとして解釈されます。

マイナス記号の種類は空白に依存するため、レクサーから 2 種類の異なるトークン (単項マイナス用と二項 (バイナリ) マイナス用) を返すのがよいと考えました。どうすればこれを実現できますか？

コード:これは私にとってはうまくいくコードですが、十分に堅牢かどうかはよくわかりません。関係のないレクサー規則をすべて削除して、単純化しようとしました。

#ifndef LEXER_H
#define LEXER_H

#include <iostream>
#include <algorithm>
#include <string>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_function.hpp>
#include <boost/spirit/include/phoenix_algorithm.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_object.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>

#define BOOST_SPIRIT_LEXERTL_DEBUG 1

using std::string;
using std::cerr;

namespace skill {

   // Short aliases for the Boost namespaces used throughout this header.
   namespace lex = boost::spirit::lex;
   namespace phoenix = boost::phoenix;

   // Base iterator type: the lexer walks a mutable std::string.
   typedef string::iterator BaseIteratorT;

   // Token type: tokens iterate over BaseIteratorT and may carry either an
   // int or a string attribute (the types listed in the mpl::vector).
   typedef lex::lexertl::token<BaseIteratorT, boost::mpl::vector<int, string> > TokenT;

   // Lexer type: actor_lexer is used so that token definitions can have
   // semantic actions attached to them.
   typedef lex::lexertl::actor_lexer<TokenT> LexerT;

   /**
    * Token definitions for a lexer that tells unary minus apart from binary
    * minus based on the whitespace surrounding the '-' sign.
    *
    * Two "rewrite" rules push normalized text back into the input via
    * `unput` before the real minus tokens are matched.
    *
    * NOTE(review): `unput` is not defined in this block; it is presumably a
    * Phoenix function that replaces the matched range [_start, _end) in the
    * input with the given string -- confirm against its definition.
    *
    * Note that the template parameter `LexerT` shadows the namespace-level
    * typedef of the same name inside this struct.
    */
   template <typename LexerT>
   struct Tokens: public lex::lexer<LexerT>
   {
      /**
       * Registers the pattern macros and token definitions with the lexer.
       *
       * @param input  Not read by this constructor; kept so existing callers
       *               continue to compile.
       */
      Tokens(const string& input):
         lineNo_(1)
      {
         (void)input; // intentionally unused

         using lex::_start;
         using lex::_end;
         using lex::_pass;
         using lex::_state;
         using lex::_tokenid;
         using lex::_val;
         using lex::omit;
         using lex::pass_flags;
         using lex::token_def;
         using phoenix::ref;
         using phoenix::count;
         using phoenix::construct;

         // macros
         this->self.add_pattern
            ("EXP",     "(e|E)(\\+|-)?\\d+")
            ("SUFFIX",  "[yzafpnumkKMGTPEZY]")
            ("INTEGER", "-?\\d+")
            ("FLOAT",   "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
            ("SYMBOL",  "[a-zA-Z_?@](\\w|\\?|@)*")
            ("STRING",  "\\\"([^\\\"]|\\\\\\\")*\\\"");

         // whitespaces and comments
         whitespaces_ = "\\s+";
         comments_    = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";

         // literals
         float_   = "{FLOAT}";
         integer_ = "{INTEGER}";
         string_  = "{STRING}";
         symbol_  = "{SYMBOL}";

         // operators
         plus_          = '+';
         difference_    = '-';
         minus_         = "-({SYMBOL}|\\()";

         // ... more operators

         // whitespace: add the number of newlines in the matched text to the
         // line counter, then drop the token
         this->self += whitespaces_
            [
               ref(lineNo_) += count(construct<string>(_start, _end), '\n'),
               _pass = pass_flags::pass_ignore
            ];

         // a minus between two identifiers, numbers or close-open parens is a binary minus, so add spaces around it
         // (the three matched characters "a-b" are pushed back as "a - b").
         // NOTE(review): the pattern allows exactly one character before the
         // '-', so it appears to fire only when the left operand's token is a
         // single character; for "foo-bar" the symbol rule would consume
         // "foo" first and "-bar" would then match the unary rule -- verify.
         this->self += token_def<omit>("[)a-zA-Z?_0-9]-[(a-zA-Z?_0-9]")
            [
               unput(_start, _end, *_start + construct<string>(" ") + *(_start + 1) + " " + *(_start + 2)),
               _pass = pass_flags::pass_ignore
            ];

         // operators (except for close-brackets) cannot be followed by a binary minus:
         // push back the operator character immediately followed by '-',
         // dropping the whitespace around the minus
         this->self += token_def<omit>("['`.+*<>/!~&|({\\[=,:@](\\s+-\\s*|\\s*-\\s+)")
            [
               unput(_start, _end, *_start + construct<string>("-")),
               _pass = pass_flags::pass_ignore
            ];

         // a minus directly preceding a symbol or an open paren is a unary minus:
         // push back everything after the '-' and set the token value to "-"
         this->self += minus_
            [
               unput(_start, _end, construct<string>(_start + 1, _end)),
               _val = construct<string>("-")
            ];

         // literal rules
         // integer_ must be registered before float_: {FLOAT} also accepts a
         // bare digit run, and on an equal-length match the definition added
         // first wins, so with float_ first integer_ could never match.
         // Longer float forms ("1.5", "1e3", "1k") still win on length.
         this->self += integer_ | float_ | string_ | symbol_;

         // ... other rules
      }

      ~Tokens() {}

      /// Returns the current line counter (starts at 1 and is incremented by
      /// the whitespace rule for each newline it consumes).
      size_t lineNo() const { return lineNo_; }

      // ignored tokens
      token_def<omit> whitespaces_, comments_;

      // literal tokens
      token_def<int> integer_;
      token_def<string>  float_, symbol_, string_;

      // operator tokens
      token_def<> plus_, difference_, minus_; // minus_ is a unary minus
      // ... other tokens

      // current line number
      size_t lineNo_;
   };
}

#endif // LEXER_H

基本的に、二項 (バイナリ) マイナス (コード内では difference と呼んでいます) を両側に空白がある任意のマイナス記号として定義し、この規則が確実に成り立つように unput を使用しました。また、単項マイナスを記号または開き括弧の直前にあるマイナス記号として定義し、こちらも規則が確実に維持されるように unput を使用しました (数値の場合、マイナス記号はトークンの一部になります)。

boost - Spirit.Lex を使用して独特の単項マイナス記号を解析する

0 に答える 0

Related

Reference