URL parsing using boost::spirit::x3

69 Views Asked by At

I'm trying to parse a URL and break it down into parts using boost::spirit::x3, as below:

#include <iostream>
#include <boost/fusion/adapted/std_tuple.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/home/x3.hpp>

// The three text pieces a URL is broken into.
struct UrlParts {
    std::string prefix;  // text in front of the host
    std::string host;    // the host itself
    std::string suffix;  // text after the host
};
// Registers UrlParts as a Boost.Fusion sequence of (prefix, host, suffix) so it can be
// used directly as a parser attribute. NOTE(review): the x3::parse call in parseSpirit
// passes a std::tie tuple instead of a UrlParts, so this adaptation looks unused there.
BOOST_FUSION_ADAPT_STRUCT(UrlParts, prefix, host, suffix)

// Splits `input` into prefix / host / suffix using a Boost.Spirit X3 grammar.
//   prefix : optional scheme ("...://") followed by optional userinfo ("...@")
//   host   : required; either bare or wrapped in '[' ... ']'
//   suffix : optional ":port" followed by optional path/query/fragment
// The grammar is followed by x3::eoi, so the whole input has to be consumed.
// On failure "Parsing failed" is written to std::cout and `parts` is returned anyway
// (it may be empty or only partly filled in that case).
// NOTE(review): this listing is reported to fail to compile on Boost 1.76 with the
// static assertion "Size of the passed attribute is less than expected".
UrlParts parseSpirit(std::string_view input) {
    namespace x3 = boost::spirit::x3;

    // raw[] exposes the matched input range, so the "://" and "@" delimiters stay in the text.
    static const auto scheme_    = (x3::raw[+x3::char_("a-zA-Z0-9+.-") >> "://"]);
    static const auto userinfo_  = (x3::raw[+~x3::char_("@") >> "@"]);
    static const auto prefix_    = (-scheme_ >> -userinfo_);
    // ':' with up to 5 optional digits; the and-predicate only peeks that '/', '?', '#'
    // or end of input comes next, without consuming it.
    static const auto port_      = (x3::raw[':' >> -x3::repeat(1, 5)[x3::digit] >> &(x3::char_("/?#") | x3::eoi)]);
    // Host characters are taken one at a time, stopping where a port spec would start,
    // so the ':' separators inside the host are not mistaken for the port.
    static const auto host_      = (+(x3::char_("a-fxXA-F0-9:.") - port_));
    static const auto path_      = (x3::char_("/?#") >> *x3::char_);    // to store path+query+fragment
    static const auto suffix_    = (-port_ >> -path_);

    //static const auto url = x3::rule<class url, UrlParts>() = -prefix_ >> ('[' >> host_ >> ']' | host_) >> -suffix_;
    static const auto url = -prefix_ >> ('[' >> host_ >> ']' | host_) >> -suffix_;    // prefix & suffix are optional but host is required

    // BOOST_SPIRIT_DEBUG_NODES((scheme_)(userinfo_)(host_)(port_)(path_)(url));

    // Parse the input
    auto iter = input.begin();
    auto end = input.end();
    UrlParts parts;
    // Tuple of references: the parser writes straight into the members of `parts`.
    auto attr = std::tie(parts.prefix, parts.host, parts.suffix);
    //parse(input.begin(), input.end(), x3::eps >> url >> x3::eoi, parts);
    bool ret = x3::parse(iter, end, url >> x3::eoi, attr);
    if (!ret) {
        std::cout << "Parsing failed" << std::endl;
    }
    return parts;
}

// Runs parseSpirit over a handful of sample URLs and prints the three parts of each.
int main()
{
    for (auto input : {"http://usr:pwd@192.168.1.1:8080/file.php?abc=1#23",
                       "http://[::ffff:192.168.1.1]:8080/file.php?abc=1#23",
                       "http://::ffff:192.168.1.1/file.php?abc=1#23",
                       "::ffff:192.168.1.1"
                      }) {
        std::cout << "Input: " << input << '\n';
        // Parse the URL of this iteration; a fixed literal here would make every
        // iteration print the result for the same URL regardless of `input`.
        const auto parts = parseSpirit(input);
        std::cout << "Output: Prefix: " << parts.prefix << ", Host: " << parts.host << ", Suffix: " << parts.suffix << '\n';
        std::cout << "================" << '\n';
    }
    return 0;
}

But above code fails to compile with error:

/usr/include/boost/spirit/home/x3/operator/detail/sequence.hpp:140:25: error: static assertion failed: Size of the passed attribute is less than expected.
  140 |             actual_size >= expected_size
      |             ~~~~~~~~~~~~^~~~~~~~~~~~~~~~
/usr/include/boost/spirit/home/x3/operator/detail/sequence.hpp:140:25: note: ‘(((int)boost::spirit::x3::detail::partition_attribute >, boost::spirit::x3::literal_string > > >, boost::spirit::x3::optional > >, boost::spirit::x3::literal_char > > > > >, boost::spirit::x3::alternative, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > >, boost::spirit::x3::literal_char >, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > > >, boost::spirit::x3::optional, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > >, boost::spirit::x3::optional, boost::spirit::x3::kleene > > > > >, std::tuple, std::allocator >&, std::__cxx11::basic_string, std::allocator >&, std::__cxx11::basic_string, std::allocator >&>, boost::spirit::x3::unused_type, void>::actual_size) >= ((int)boost::spirit::x3::detail::partition_attribute >, boost::spirit::x3::literal_string > > >, boost::spirit::x3::optional > >, boost::spirit::x3::literal_char > > > > >, boost::spirit::x3::alternative, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > >, boost::spirit::x3::literal_char >, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > > >, boost::spirit::x3::optional, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, 
boost::spirit::x3::eoi_parser> > > > >, boost::spirit::x3::optional, boost::spirit::x3::kleene > > > > >, std::tuple, std::allocator >&, std::__cxx11::basic_string, std::allocator >&, std::__cxx11::basic_string, std::allocator >&>, boost::spirit::x3::unused_type, void>::expected_size))’ evaluates to false

Any suggestions on what's wrong ? Similar code with boost::spirit::qi works fine.
Additionally, I'm interested in learning if there is a more efficient way of doing this, eg: using string_view instead of string, since all the 3 parts are present in input view.
Thanks in advance!

1

There is 1 best solution below

2
On BEST ANSWER

It means the attributes are not detected as compatible for the parser expression. See it broken in 1.76:

The first problem I see is trying to synthesize a single string (prefix) out of two raw[] directives. That's... not gonna work.

The good news is it works (again?) starting with 1.77, but in general consider being a bit more explicit about attribute compatibility, e.g. with as_type from this answer, Understanding the List Operator (%) in Boost.Spirit, or the many others https://stackoverflow.com/search?tab=newest&q=user%3a85371%20x3%20as_type&searchOn=3 or even more if you look for the name as, which I usually prefer... https://stackoverflow.com/search?q=user%3A85371+x3+as+x3%3A%3Arule

That said, when I look at your code, you bind a manually tied tuple AND somehow still adapt the struct? That's redundant. I'd assume you want to just tie manually, so drop the adaptation.

Next up, by all means, don't use a parser combinator library as a tokenizer. I.e., don't randomly bunch together unrelated productions (suffix really should not contain the port specification).

Also, parse into a real port number using... an integer parser. Certainly if you're going to be pedantically strict about the number of digits allowed anyway! See here:

// Unsigned base-10 parser that takes 1 to 5 digits and has uint16_t as its attribute type.
auto portnum_   = x3::uint_parser<uint16_t, 10, 1, 5>{};
// ':' then the port number; the and-predicate only peeks that '/', '?', '#' or end of input follows.
auto portspec_  = ':' >> portnum_ >> &(x3::char_("/?#") | x3::eoi);

Be careful with double optionality. E.g. Since suffix_ = -port_ >> -path_ literally has only optional elements, the expression -suffix_ at best has the same meaning as suffix_. However, there are lots of situations (optional repeating constructs) where you will get infinite loops of zero-length matches.

I suppose / or # should also end the user info production.

Not everything needs to be raw. E.g. I'd prefer

// One or more characters other than '@', '/' or '#', then the '@' itself
// (char_ rather than a literal, so the '@' is kept in the attribute).
auto userinfo_  = +~x3::char_("@/#") >> x3::char_("@");

for the userinfo. In fact, if you do

// Host, optionally wrapped in '[' ... ']' (the brackets are literals and are not stored),
// followed by an optional port specification.
auto authority_ = ('[' >> host_ >> ']' | host_) >> -portspec_;

You will correctly get the parsed host without the [] brackets, which have meaning ONLY in the URI grammar.

You had the end-of-input validation commented out, so let's re-enable it. Let's also return richer information (including a valid flag) and add a lot more test cases:

Live On Compiler Explorer

#include <boost/fusion/adapted/std_tuple.hpp>
#include <boost/spirit/home/x3.hpp>
#include <iostream>

// The three text pieces a URL is broken into.
struct UrlParts {
    std::string prefix;  // text in front of the host
    std::string host;    // the host itself
    std::string suffix;  // text after the host
};

// Cut-down variant of the URL parser: the active grammar is only `-prefix_` followed by
// x3::eoi, and only p.prefix is bound as the attribute. p.host and p.suffix are never
// written and come back empty. Any input that continues past the scheme/userinfo fails
// the eoi check, in which case "Parsing failed" is written to std::cout.
// NOTE(review): this looks like a reduced reproduction of the attribute problem rather
// than the finished parser — host_ and suffix_ are defined but take no part in `url`.
UrlParts parseSpirit(std::string_view input) {
    namespace x3 = boost::spirit::x3;

    // raw[] exposes the matched input range, so the "://" and "@" delimiters stay in the text.
    auto scheme_   = x3::raw[+x3::char_("a-zA-Z0-9+.-") >> "://"];
    auto userinfo_ = x3::raw[+~x3::char_("@") >> "@"];
    // Inside prefix_ the scheme is mandatory; the prefix as a whole is made optional in `url`.
    auto prefix_   = scheme_ >> -userinfo_;
    // ':' with up to 5 optional digits; the and-predicate only peeks that '/', '?', '#'
    // or end of input comes next, without consuming it.
    auto port_     = x3::raw[':' >> -x3::repeat(1, 5)[x3::digit] >> &(x3::char_("/?#") | x3::eoi)];
    auto host_     = +(x3::char_("a-fxXA-F0-9:.") - port_);
    auto path_     = x3::char_("/?#") >> *x3::char_; // to store path+query+fragment
    auto suffix_   = -port_ >> -path_;

    // static const auto url =  = -prefix_ >> ('[' >> host_ >> ']' | host_) >>
    // -suffix_;
    // Host and suffix are commented out below, leaving just the optional prefix.
    auto url                              //
        //= x3::rule<class url, UrlParts>() //
        = -prefix_                        //
                                          //>> ('[' >> host_ >> ']' | host_) >>
        // -suffix_ // prefix & suffix are optional but host is required
        ;

    // Parse the input
    auto iter = input.begin();
    auto end = input.end();
    UrlParts p;
    // One-element tuple of references: the parser writes straight into p.prefix.
    auto attr = std::tie(p.prefix/*, p.host, p.suffix*/);
    bool ret = x3::parse(iter, end, url >> x3::eoi, attr);
    if (!ret) {
        std::cout << "Parsing failed" << std::endl;
    }
    return p;
}

// Runs parseSpirit over a handful of sample URLs and prints the three parts of each.
int main() {
    for (auto input : {"http://usr:pwd@192.168.1.1:8080/file.php?abc=1#23",
                       "http://[::ffff:192.168.1.1]:8080/file.php?abc=1#23",
                       "http://::ffff:192.168.1.1/file.php?abc=1#23", "::ffff:192.168.1.1"}) {
        std::cout << "Input: " << input << '\n';
        // Parse the URL of this iteration; a fixed literal here would make every
        // iteration print the result for the same URL regardless of `input`.
        const auto parts = parseSpirit(input);
        std::cout << "Output: Prefix: " << parts.prefix << ", Host: " << parts.host << ", Suffix: " << parts.suffix << '\n';
        std::cout << "================" << '\n';
    }
}

Printing

192.168.1.1 {true, "", "", "192.168.1.1", "", unspecified}
192.168.1.1/    {true, "", "", "192.168.1.1", "/", unspecified}
192.168.1.1/file.php    {true, "", "", "192.168.1.1", "/file.php", unspecified}
192.168.1.1/file.php?abc=1  {true, "", "", "192.168.1.1", "/file.php?abc=1", unspecified}
192.168.1.1:8888    {true, "", "", "192.168.1.1", "", 8888}
192.168.1.1:8888/   {true, "", "", "192.168.1.1", "/", 8888}
192.168.1.1:8888/file.php   {true, "", "", "192.168.1.1", "/file.php", 8888}
192.168.1.1:8888/file.php?abc=1 {true, "", "", "192.168.1.1", "/file.php?abc=1", 8888}
::ffffff::192.168.1.1:9999/file.php?abc=1   {true, "", "", "::ffffff::192.168.1.1", "/file.php?abc=1", 9999}
http://192.168.1.1  {true, "http://", "", "192.168.1.1", "", unspecified}
http://192.168.1.1/ {true, "http://", "", "192.168.1.1", "/", unspecified}
http://192.168.1.1/file.php {true, "http://", "", "192.168.1.1", "/file.php", unspecified}
http://192.168.1.1/file.php?abc=1   {true, "http://", "", "192.168.1.1", "/file.php?abc=1", unspecified}
http://192.168.1.1:8888 {true, "http://", "", "192.168.1.1", "", 8888}
http://192.168.1.1:8888/    {true, "http://", "", "192.168.1.1", "/", 8888}
http://192.168.1.1:8888/file.php    {true, "http://", "", "192.168.1.1", "/file.php", 8888}
http://192.168.1.1:8888/file.php?abc=1  {true, "http://", "", "192.168.1.1", "/file.php?abc=1", 8888}
http://::ffffff::192.168.1.1:9999/file.php?abc=1    {true, "http://", "", "::ffffff::192.168.1.1", "/file.php?abc=1", 9999}
http://sehe@192.168.1.1 {true, "http://", "sehe@", "192.168.1.1", "", unspecified}
http://sehe@192.168.1.1/    {true, "http://", "sehe@", "192.168.1.1", "/", unspecified}
http://sehe@192.168.1.1/file.php    {true, "http://", "sehe@", "192.168.1.1", "/file.php", unspecified}
http://sehe@192.168.1.1:8888    {true, "http://", "sehe@", "192.168.1.1", "", 8888}
http://sehe@192.168.1.1:8888/   {true, "http://", "sehe@", "192.168.1.1", "/", 8888}
http://sehe@192.168.1.1:8888/file.php   {true, "http://", "sehe@", "192.168.1.1", "/file.php", 8888}
http://usr:pwd@192.168.1.1/file.php?abc=1   {true, "http://", "usr:pwd@", "192.168.1.1", "/file.php?abc=1", unspecified}
sehe@192.168.1.1    {true, "", "sehe@", "192.168.1.1", "", unspecified}
sehe@192.168.1.1/   {true, "", "sehe@", "192.168.1.1", "/", unspecified}
sehe@192.168.1.1/file.php   {true, "", "sehe@", "192.168.1.1", "/file.php", unspecified}
sehe@192.168.1.1:8888   {true, "", "sehe@", "192.168.1.1", "", 8888}
sehe@192.168.1.1:8888/  {true, "", "sehe@", "192.168.1.1", "/", 8888}
sehe@192.168.1.1:8888/file.php  {true, "", "sehe@", "192.168.1.1", "/file.php", 8888}
usr:pwd@192.168.1.1/file.php?abc=1  {true, "", "usr:pwd@", "192.168.1.1", "/file.php?abc=1", unspecified}

BUT WAIT - THE PROBLEM?

If you must use old Boost versions, we see the problem is back...

So let's fix that by re-instating the raw[] around the user-info just as a hint. I'd rather upgrade Boost obviously:

// raw[] makes the attribute the matched input range: one or more characters other than
// '@', '/' or '#', up to and including the terminating '@'.
auto userinfo_  = x3::raw[+~x3::char_("@/#") >> x3::char_("@")];

See it live on Boost 1.76 here: https://godbolt.org/z/6bc5Ycrcr

The output is identical, md5 checksums:

c69881691195579e3184ef6024136356  1.76
c69881691195579e3184ef6024136356  1.84