I'm trying to write an HTTP parser in C++ using ragel, but finding that the parser generated is not greedy.
The first step is to parse a URL, and here is the ragel grammar translated from RFC-3986:
#include <string>
#include <iostream>
#include <string_view>
%%{
machine http_uri_parser;
action mark_scheme {
std::printf("mark scheme, p:%s\n", p);
this->scheme.data = p;
}
action store_scheme {
std::printf("store scheme, p:%s\n", p);
this->scheme.len = p - this->scheme.data;
}
action mark_authority {
std::printf("mark authority, p:%s\n", p);
this->authority.data = p;
}
action store_authority {
std::printf("store authority, p:%s\n", p);
this->authority.len = p - this->authority.data;
}
action mark_userinfo {
std::printf("mark userinfo, p:%s\n", p);
this->userinfo.data = p;
}
action store_userinfo {
std::printf("store userinfo, p:%s\n", p);
this->userinfo.len = p - this->userinfo.data;
}
action mark_host {
std::printf("mark host, p:%s\n", p);
this->host.data = p;
}
action store_host {
std::printf("store host, p:%s\n", p);
this->host.len = p - this->host.data;
}
action mark_port {
std::printf("mark port, p:%s\n", p);
this->port.data = p;
}
action store_port {
std::printf("store port, p:%s\n", p);
this->port.len = p - this->port.data;
}
action mark_path {
std::printf("mark path, p:%s\n", p);
this->path.data = p;
}
action store_path {
std::printf("store path, p:%s\n", p);
this->path.len = p - this->path.data;
}
action mark_query {
std::printf("mark query, p:%s\n", p);
this->query.data = p;
}
action store_query {
std::printf("store query, p:%s\n", p);
this->query.len = p - this->query.data;
}
action mark_fragment {
std::printf("mark fragment, p:%s\n", p);
this->fragment.data = p;
}
action store_fragment {
std::printf("store fragment, p:%s\n", p);
this->fragment.len = p - this->fragment.data;
}
action done {
std::printf("parser done, p:%s\n", p);
this->_done = 1;
fbreak;
}
###############################################################################
# Characters
###############################################################################
crlf = '\r\n';
gen_delims = ( ':' | '/' | '?' | '#' | '[' | ']' | '@' );
sub_delims = ( '!' | '$' | '&' | "'" | '(' | ')'
| '*' | '+' | ',' | ';' | '=' );
reserved = ( gen_delims | sub_delims );
unreserved = ( alpha | digit | '-' | '.' | '_' | '~' );
pct_encoded = ( '%' xdigit xdigit );
###############################################################################
# Scheme
###############################################################################
scheme = ( alpha ( alpha | digit | '+' | '-' | '.' )* )
>mark_scheme %store_scheme;
###############################################################################
# Authority
###############################################################################
dec_octet = ( ( digit ) # 0-9
| ( ( '1'..'9' ) digit ) # 10-99
| ( '1' digit{2} ) # 100-199
| ( '2' ( '0'..'4' ) digit ) # 200-249
| ( '25' ( '0'..'5' ) ) # 250-255
);
IPv4_address = ( dec_octet '.' dec_octet '.' dec_octet '.' dec_octet );
h16 = ( xdigit{1,4} );
# 16 bits of address represented in hexadecimal
ls32 = ( ( h16 ':' h16 ) | IPv4_address );
# least-significant 32 bits of address
IPv6_address = ( ( ( h16 ':' ){6} ls32 )
| ( '::' ( h16 ':' ){5} ls32 )
| ( ( h16 )? '::' ( h16 ':' ){4} ls32 )
| ( ( ( h16 ':' ){,1} h16 )? '::' ( h16 ':' ){3} ls32 )
| ( ( ( h16 ':' ){,2} h16 )? '::' ( h16 ':' ){2} ls32 )
| ( ( ( h16 ':' ){,3} h16 )? '::' ( h16 ':' ){1} ls32 )
| ( ( ( h16 ':' ){,4} h16 )? '::' ls32 )
| ( ( ( h16 ':' ){,5} h16 )? '::' h16 )
| ( ( ( h16 ':' ){,6} h16 )? '::' )
);
IPv_future = ( 'v' ( ( xdigit+ ) '.' ) ( unreserved | sub_delims | ':' )+ );
IP_literal = ( '[' ( IPv6_address | IPv_future ) ']' );
reg_name = ( ( unreserved | pct_encoded | sub_delims )* )
> { std::printf("mark reg_name, p:%s\n", p); }
% { std::printf("store reg_name, p:%s\n", p); };
port = ( digit* )
>mark_port %store_port;
host = ( IP_literal | IPv4_address | reg_name )
>mark_host %store_host;
userinfo = ( ( unreserved | pct_encoded | sub_delims | ':' )* )
>mark_userinfo %store_userinfo;
authority = ( ( userinfo '@' )? host ( ':' port )? )
>mark_authority %store_authority;
###############################################################################
# Path
###############################################################################
pchar = ( unreserved | pct_encoded | sub_delims | ':' | '@' );
segment = ( pchar* );
segment_nz = ( pchar+ );
# non-zero-length
segment_nz_nc = ( ( unreserved | pct_encoded | sub_delims | '@' )+ );
# non-zero-length segment without any colon ':'
path_abempty = ( ( '/' segment )* )
>mark_path %store_path;
path_absolute = ( '/' ( segment_nz ( '/' segment )* )? )
>mark_path %store_path;
path_noscheme = ( segment_nz_nc ( '/' segment )* )
>mark_path %store_path;
path_rootless = ( segment_nz ( '/' segment )* )
>mark_path %store_path;
path_empty = ( zlen )
>mark_path %store_path;
path = ( path_abempty # begins with '/' or is empty
| path_absolute # begins with '/' but not '//'
| path_noscheme # begins with a non-colon segment
| path_rootless # begins with a segment
| path_empty # zero characters
);
###############################################################################
# Query
###############################################################################
query = ( ( pchar | '/' | '?' )* )
>mark_query %store_query;
###############################################################################
# Fragment
###############################################################################
fragment = ( ( pchar | '/' | '?' )* )
>mark_fragment %store_fragment;
###############################################################################
# URI
###############################################################################
hier_part = ( ( '//' authority path_abempty )
| ( path_absolute )
| ( path_rootless )
| ( path_empty )
);
relative_part = ( ( '//' authority path_abempty )
| ( path_absolute )
| ( path_noscheme )
| ( path_empty )
);
absolute_URI = ( scheme ':' hier_part ( '?' query )? );
relative_ref = ( relative_part ( '?' query )? ( '#' fragment )? );
URI = ( scheme ':' hier_part ( '?' query )? ( '#' fragment )? );
URI_reference = ( URI | relative_ref );
###############################################################################
# main rule
###############################################################################
main := URI @done;
}%%
%% write data;
struct slice {
size_t len{};
const char* data{};
};
struct http_parser {
http_parser() = default;
~http_parser() = default;
void reset();
void execute();
int state = 0;
std::string uri;
/* parsed result */
slice scheme{};
slice authority{};
slice userinfo{};
slice host{};
slice port{};
slice path{};
slice query{};
slice fragment{};
/* parse status */
bool _eof{};
bool _done{};
bool _failed{};
};
void http_parser::reset() {
int cs = 0;
%% write init;
this->state = cs;
this->_eof = false;
this->_done = false;
this->_failed = false;
this->scheme = slice{};
this->authority = slice{};
this->userinfo = slice{};
this->host = slice{};
this->port = slice{};
this->path = slice{};
this->query = slice{};
this->fragment = slice{};
}
void http_parser::execute() {
const char* p = &this->uri.front();
const char* pe = &this->uri.back() + 1;
const char* eof = pe;
int cs = this->state;
%% write exec;
if (!this->_failed) {
this->state = cs;
}
std::printf(
"eof:%d, done:%d, failed:%d, state:%d, p:%p, pe:%p, diff:%ld, rest:%s\n",
this->_eof, this->_done, this->_failed, this->state, p, pe, pe - p, p);
#define print_parser_component(fld) \
if (this->fld.len) { \
std::printf(#fld ": %.*s\n", (int)this->fld.len, this->fld.data); \
}
print_parser_component(scheme);
print_parser_component(authority);
print_parser_component(userinfo);
print_parser_component(host);
print_parser_component(port);
print_parser_component(path);
print_parser_component(query);
print_parser_component(fragment);
#undef print_parser_component
}
Here I set the main rule to URL rather than URI_reference in order to test absolute URL first.
And here is the test code:
int main(int argc, char** argv) {
auto parser = std::make_unique<http_parser>();
parser->uri =
"https://chenjianyong.com/blog/2022/01/"
"seastar_fpc_1.html?hello=world#preface";
parser->reset();
parser->execute();
return 0;
}
Run the program and it prints:
mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
parser done, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
eof:0, done:1, failed:0, state:171, p:0x6000016f4006, pe:0x6000016f404c, diff:70, rest://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
scheme: https
It seems that the parser stops after parsing the scheme https://, its so weird! Why doesn't it greedily consume to the last byte?
After changing the main rule to main := (crlf @done);, append a crlf to the test URL, regenerate the parser and this time the parser can consume to the end and the print shows that all URL components are parsed successfully:
mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark path, p://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark authority, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark userinfo, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark host, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark reg_name, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store reg_name, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store host, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store authority, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark path, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store path, p:?hello=world#preface
mark query, p:hello=world#preface
store query, p:#preface
mark fragment, p:preface
store fragment, p:
parser done, p:
eof:0, done:1, failed:0, state:188, p:0x60000140c04e, pe:0x60000140c04e, diff:0, rest:
scheme: https
authority: chenjianyong.com
host: chenjianyong.com
path: /blog/2022/01/seastar_fpc_1.html
query: hello=world
fragment: preface
So, why isn't my ragel parser greedy?
As you have already noticed, your
doneaction is executed right after the colon (withfbreakinside) and we can confirm it by rendering your (quite big one!) FSM (ragel -o f.dot -Vp source.c++ && dot -Tpng -o f.png f.dot):It's executed because
mainspecification says@donewhich, according to the documentationAnd as you can see, 171 is one of the possible final states of your machine. If you're to change
mainspecification toURI %doneit's gonna be a bit different:doneis no longer executed, it's going to be only afterWhich I think is a bit more appropriate for your case.
Now you probably wonder why 171 is one of the final states and that's because
hier_partamong other things can bepath_emptywhich is azlen, so it's OK for machine to finish in this state (forscheme:input) -> it's one of the final states ->@doneis executed when transitioning to this state.