Boost Spirit lexer states cross pollinate

307 views

I am trying to use lexer states to do context specific parsing, but it seems that different lexer states do cross-pollinate. Here is a very basic example

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_container.hpp>

#include <iostream>
#include <string>

using namespace boost::spirit;

// Token definitions for the comment-stripping example.
// Two lexer states are involved: the default "INITIAL" state recognises
// "hello" and the comment opener, while the "COMMENT" state is meant to
// recognise everything up to and including the comment closer.
//
// NOTE(review): nothing here ever switches the lexer into the "COMMENT"
// state — that is exactly the problem discussed in this question.
template <typename Lexer>
struct strip_comments_tokens : lex::lexer<Lexer>
{
    strip_comments_tokens() 
      : strip_comments_tokens::base_type(lex::match_flags::match_default)
    {
        // "/*" — start of a C-style comment (both characters regex-escaped).
        ccomment = "\\/\\*";
        // Anything (greedy) followed by the closing "*/".
        endcomment = ".*\\*\\/";
        hello = "hello";

        // Tokens available in the default ("INITIAL") lexer state.
        this->self.add
            (ccomment)
            (hello);

        // Token available only while the lexer is in the "COMMENT" state.
        this->self("COMMENT").add
            (endcomment);
    }

    lex::token_def<> ccomment, endcomment;  // no attribute value exposed
    lex::token_def<std::string> hello;      // exposes the matched text
};

// Grammar consuming the token stream produced by strip_comments_tokens:
// zero or more of either (comment opener then, in the "COMMENT" state,
// the comment closer) or a "hello" token, which is echoed to stdout.
//
// NOTE(review): qi::in_state("COMMENT")[...] only selects which tokens the
// PARSER will accept there; it does not switch the lexer's state — see the
// accepted answer below.
template <typename Iterator>
struct strip_comments_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    strip_comments_grammar(TokenDef const& tok)
      : strip_comments_grammar::base_type(start)
    {
        start =  *(   tok.ccomment 
                      >>  qi::in_state("COMMENT") 
                      [
                          tok.endcomment 
                      ]
              |   tok.hello [ std::cout << _1 ]  // print each matched "hello"
        );
    }

    qi::rule<Iterator> start;
};


// Drive the lexer + parser over the test input "hello/*hello*/hello".
int main(int argc, char* argv[])
{
    typedef std::string::iterator base_iterator_type;

    // Plain lexertl lexer — no semantic actions attached to token_defs.
    typedef 
        lex::lexertl::lexer<lex::lexertl::token<base_iterator_type> > 
    lexer_type;

    typedef strip_comments_tokens<lexer_type>::iterator_type iterator_type;

    strip_comments_tokens<lexer_type> strip_comments;           // Our lexer
    strip_comments_grammar<iterator_type> g (strip_comments);   // Our parser 

    std::string str("hello/*hello*/hello");
    base_iterator_type first = str.begin();

    // Lex and parse in one pass; note the result r is never inspected.
    bool r = lex::tokenize_and_parse(first, str.end(), strip_comments, g);

    return 0;
}

I would expect the input

"hello/*hello*/hello"

to be tokenized as hello ccomment endcomment hello. But what happens is the input gets tokenized as hello ccomment hello, so the grammar stops working. If you change the input to

"hello/*anything else*/hello" 

everything works as expected.

Any ideas?

1 Answer

Answer (score 5) by sehe:

You never modify the state of the lexer. So it's always in the "INITIAL" state.

Setting the lexer state should be done in the Lexer stage (there's no reliable way to feedback from the parser stage, in my experience and after much experimentation).

So you need to upgrade to actor_lexer and attach semantic actions to the token_defs added to the lexer tables:

// Use actor_lexer so that semantic actions attached to the token
// definitions (below) are actually invoked during tokenization.
typedef 
    lex::lexertl::actor_lexer<lex::lexertl::token<base_iterator_type> > 
lexer_type;

And

this->self += 
     ccomment [ lex::_state = "COMMENT" ]    // enter COMMENT state on "/*"
   | hello;

this->self("COMMENT") += 
    endcomment [ lex::_state = "INITIAL" ];  // return to INITIAL after "*/"

That said, I suppose it's much easier to just skip the tokens altogether. If you really want to know how to use lexer states for skipping, see the linked answer (link omitted here).

I'd suggest the "Simplify And Profit" approach using lex::_pass = lex::pass_flags::pass_ignore instead.

Here's my take:

Live On Coliru

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/qi.hpp> // for the parser expression *strip_comments.hello

namespace lex = boost::spirit::lex;
namespace phx = boost::phoenix;

// Lexer that drops comments at the tokenization stage: a whole comment is
// matched as a single token and discarded via lex::_pass = pass_ignore, so
// no lexer-state switching is needed at all.
template <typename Lexer>
struct strip_comments_tokens : lex::lexer<Lexer> {
    strip_comments_tokens() 
      : strip_comments_tokens::base_type(lex::match_flags::match_default)
    {
        // One regex for the entire comment. NOTE(review): ".*" is greedy, so
        // input like "/*a*/x/*b*/" would match through the LAST "*/" —
        // confirm this is acceptable for the intended inputs.
        ccomment   = "\\/\\*.*\\*\\/";
        hello      = "hello"; // why not "."?

        this->self += 
             ccomment [ lex::_pass = lex::pass_flags::pass_ignore ]  // skip comments entirely
  // IDEA: | lex::token_def<char>(".") // to just accept anything
           | hello
           ;
    }

    lex::token_def<lex::omit>   ccomment;   // attribute suppressed
    lex::token_def<std::string> hello;      // yields the matched text
};

// Tokenize "hello/*hello*/hello", ignoring comments, and collect the
// remaining "hello" tokens into a single string.
int main() {
    typedef std::string::const_iterator base_iterator_type;
    // actor_lexer is required for the lex::_pass semantic action to run.
    typedef lex::lexertl::actor_lexer<
                lex::lexertl::token<base_iterator_type/*, boost::mpl::vector<char, std::string>, boost::mpl::false_*/>
            > lexer_type;

    strip_comments_tokens<lexer_type> strip_comments;         // Our lexer

    std::string const str("hello/*hello*/hello");
    std::string stripped;                                     // parser output accumulates here

    base_iterator_type first = str.begin();
    // The whole parser is *strip_comments.hello — zero or more hello tokens.
    bool r = lex::tokenize_and_parse(first, str.end(), strip_comments, *strip_comments.hello, stripped);

    if (r)
        std::cout << "\nStripped: '" << stripped << "'\n";
    else
        std::cout << "Failed: '" << std::string(first, str.end()) << "'\n";
}