Parse::RecDescent performance issue

474 Views Asked by At

I'm using Parse::RecDescent to parse lines in a Cisco IOS ACL. The ACL is used on the edge router of a large network, so it contains almost 8k lines which are set by the government. I'm looping through each of those lines and putting the values into a hash. Although it is 8k lines I'm still spending over 14 seconds parsing the lines? Does this sound reasonable? It seems VERY slow to me. Is there some overhead to using a hash verse another data structure?

Sample Input: (about 8k of these or similar)

deny   ip 2.3.4.5 0.0.0.7 any log-input
deny   ip 5.6.7.8 0.0.0.255 any log-input
deny   ip host 9.10.11.12 any log-input
deny   ip 13.14.15.16 0.0.31.255 any log-input
permit tcp host 17.18.19.20 host 21.22.23.24 eq bgp
permit icmp 25.26.0.0 0.0.255.255 27.28.0.0 0.0.255.255

Here is my entire parser:

package AccessList::Parser;

use strict;
use warnings;
use Carp;
use Scalar::Util 'blessed';
use Parse::RecDescent;

our $VERSION = '0.05';

sub new {
    my ($class) = @_;
    my $self = { PARSER => undef, };
    bless $self, $class;
    $self->_init();
    return $self;
}

sub _init {
    my ($self) = @_;
    $self->{PARSER} = Parse::RecDescent->new( $self->_grammar() );
}

sub parse {
    my ( $self, $string ) = @_;
    defined ($string) or confess "blank line received";
    my $tree = $self->{PARSER}->startrule($string);
    defined($tree) or confess "unrecognized line\n";
    return visit($tree);
}

#
# Finished tests
#

sub visit {
    my ($node) = @_;

    my $Rule_To_Key_Map = {
        "acl_action"              => 1,
        "acl_protocol"            => 1,
        "acl_src_ip"              => 1,
        "acl_src_port"            => 1,
        "acl_dst_ip"              => 1,
        "acl_dst_port"            => 1,
        "acl_remark"              => 1
    };

    my $parent_key;
    my $result;

    # set s of explored vertices
    my %seen;

    #stack is all neighbors of s
    my @stack;
    push @stack, [ $node, $parent_key ];

    my $key;

    while (@stack) {

        my $rec = pop @stack;

        $node       = $rec->[0];
        $parent_key = $rec->[1];    #undef for root

        next if ( $seen{$node}++ );

        my $rule_id = ref($node);

        if ( exists( $Rule_To_Key_Map->{$rule_id} ) ) {
            $parent_key = $rule_id;
        }

        foreach my $key ( keys %$node ) {
            next if ( $key eq "EOL" );
            my $next = $node->{$key};
            if ( blessed($next) ) {
                if ( exists( $next->{__VALUE__} ) ) {
                    #print ref($node), " ", ref($next), " ", $next->{__VALUE__},"\n";
                    my $rule  = ref($node);
                    my $token = $next->{__VALUE__};
                    $result->{$parent_key} = $token;
                    #print $rule, " ", $result->{$rule}, "\n";
                }
                push @stack, [ $next, $parent_key ];
                #push @stack, $next;
            }
        }
    }
    return $result;
}

sub _grammar {
    my ($self) = @_;

    my $grammar = q{
<autotree>

startrule :
        access_list EOL
    |   acl_remark EOL
    |   <error>

#
# access-lists
#

access_list : acl_action

acl_remark :
        "remark" REMARKS

acl_action :
        ACTIONS acl_protocol

#
# protocol options
#

acl_protocol :
        PROTOCOL acl_src_ip

#
# access-list source IP addresses
#

acl_src_ip :
        address acl_dst_ip
    |   address acl_src_port

#
# access-list source ports
#

acl_src_port : 
        port acl_dst_ip

#
# access-list destination IP address
#

acl_dst_ip :
        address acl_dst_port
    |   address acl_options
    | address CONNECTION_TYPE
    | address LAYER3_OPTIONS
    | IPRANGE

#
# access-list destination ports
#

acl_dst_port : 
        port acl_options
    |   acl_icmp_type acl_options

#
# icmp_types
#

acl_icmp_type :
       ICMP_TYPE

#
# access-list options
#

acl_options :
      acl_logging LAYER3_OPTIONS
    |   acl_logging
    |   EOL
    |   <error>

acl_logging :
            "log-input"
    |       "log"

#
# IP address types
#
# "object" should be fine here because "object" can not  
# be used to specify ports 

address :
        "host" IPADDRESS
    |   "host" NAME
    |   IPNETWORK
    | WILDCARD_NETWORK
    |   ANY


#
# port types
#

port :
        port_eq
    |   port_range
    |   port_gt
    |   port_lt
    |   port_neq

port_eq :
    "eq" PORT_ID

port_range :
    "range" PORT_RANGE

port_gt :
    "gt" PORT_GT

port_lt :
    "lt" PORT_LT

port_neq :
    "neq" <error: neq is unsupported>

#
# Token Definitions
#

STRING :
        /\S+/

DIGIT :
        /\d+/

NAME :
        /((^|\s[a-zA-Z])(\.|[0-9a-zA-Z_-]+)+)/

RULE_REF :
        /\S+/

ANY:
        "any"

IPADDRESS :
        /((\d{1,3})((\.)(\d{1,3})){3})/

MASK :
        /(((255\.){3}(255|254|252|248|240|224|192|128|0+))|((255\.){2}(255|254|252|248|240|224|192|128|0+)\.0)|((255\.)(255|254|252|248|240|224|192|128|0+)(\.0+){2})|((255|254|252|248|240|224|192|128|0+)(\.0+){3}))/

INVERSE_MASK :
        /(0+|1|3|7|15|31|63|127|255)((\.)(255|127|63|31|15|7|3|1|0)){3}/

WILDCARD_NETWORK :
        /((\d{1,3})((\.)(\d{1,3})){3}) (0+|1|3|7|15|31|63|127|255)((\.)(255|127|63|31|15|7|3|1|0)){3}/

IPNETWORK :
        /((\d{1,3})((\.)(\d{1,3})){3}) (((255\.){3}(255|254|252|248|240|224|192|128|0+))|((255\.){2}(255|254|252|248|240|224|192|128|0+)\.0)|((255\.)(255|254|252|248|240|224|192|128|0+)(\.0+){2})|((255|254|252|248|240|224|192|128|0+)(\.0+){3}))/

IPRANGE :
        /((\d{1,3})((\.)(\d{1,3})){3}) ((\d{1,3})((\.)(\d{1,3})){3})/

PROTOCOL :
        /\d+/ | "ahp" | "eigrp" | "esp" | "gre" | "icmp" | "icmp6" | "igmp" 
    | "igrp" | "ip" | "ipinip" | "ipsec" | "nos" | "ospf" | "pcp" 
    | "pim" | "pptp" | "snp" | "tcp" | "udp" | "41" 

GROUP_PROTOCOL :
        "tcp-udp" | "tcp" | "udp"

ICMP_TYPE : 
        /\d+/ | "alternate-address" | "conversion-error" | "echo-reply" | "echo"
    | "information-reply" | "information-request" | "mask-reply" | "mask-request"
    | "mobile-redirect" | "parameter-problem" | "redirect" | "router-advertisement"
    | "router-solicitation" | "source-quench" | "time-exceeded" | "timestamp-reply"
    | "timestamp-request" | "traceroute" | "unreachable"

CONNECTION_TYPE:
        "established"

LAYER3_OPTIONS:
        "fragments" | "packet-too-big"

PORT_ID :
        /\S+/

PORT_GT :
        /\S+/
{
    bless {__VALUE__=>"$item[1] 65535"}, $item[0]
}

PORT_LT :
        /\S+/
{
    bless {__VALUE__=>"1 $item[1]"}, $item[0]
}

PORT_RANGE :
        /\S+ \S+/

ACTIONS :
        "permit"
    |   "deny"

REMARKS :
        /.*$/

LOG_LEVEL :
        /\d+/ | "emergencies" | "alerts" | "critical" | "errors" 
    | "warnings" | "notifications" | "informational" | "debugging"
    | "disable"

EOL :
        /$/ 
};

    return $grammar;
}

1;
3

There are 3 best solutions below

2
On BEST ANSWER

Performance problems:

  • Factor out common prefixes (e.g. address in acl_dst_ip, IPRANGE in acl_dst_ip)
  • Remove unnecessary rules (e.g. access_list)

Functionality problems:

  • You mistakenly treat remarkfoo as remark. Similar mistakes elsewhere.
  • You allow newlines between tokens, but that doesn't seem to be desired.
  • You only allow single spaces between some tokens when you should have a more permitting definition of whitespace.
  • The same rule treats 0.0.127.4 0.0.127.255 as "from 0.0.127.4 to 0.0.127.255" and as "from 0.0.0.0 to 0.0.127.255". (The first one found win, so its treated as "from 0.0.0.0 to 0.0.127.255".) The distinction shouldn't even be made in the parser.

I started fixing up your code. (COMPLETELY UNTESTED)

# make_parser.pl

use strict;
use warnings;

use Parse::RecDescent qw( );

my $grammar = <<'__EOI__';

   {
      use strict;
      use warnings;

      use Socket qw( inet_aton );

      my %protocol_names = map { $_ => 1 } qw(
         ahp   eigrp  esp     gre    icmp  icmp6  igmp
         igrp  ip     ipinip  ipsec  nos   ospf   pcp
         pim   pptp   snp     tcp    udp
      );

      my %protocol_group_names = map { $_ => 1 } qw(
         tcp-udp  tcp  udp
      );

      my %icmp_type_names = map { $_ => 1 } qw(
         alternate-address    conversion-error     echo-reply     echo
         information-reply    information-request  mask-reply     mask-request
         mobile-redirect      parameter-problem    redirect       router-advertisement
         router-solicitation  source-quench        time-exceeded  timestamp-reply
         timestamp-request    traceroute           unreachable
      );

      sub parse_ipv4_addr {
         my ($addr) = @_;
         return inet_aton($addr);
      }
   }

   parse            : <skip: qr/[ \t]*/> line(s) /\Z/ { $item[2] }

   line             : line_body /\n|\Z/ { $item[1] }

   line_body        : PERMIT <commit> permit_deny_args { [ $item[1], $item[3] ] }
                    | DENY   <commit> permit_deny_args { [ $item[1], $item[3] ] }
                    | REMARK <commit> /[^\n]*/         { 0 }
                    | /[ \t]+/                         { 0 }

   permit_deny_args : protocol permit_deny_src permit_deny_dst { [ @item[1,2,3] ] }

   permit_deny_src  : addrs ports { [ @item[1, 2] ] }

   permit_deny_dst  : ...

   addrs            : HOST      <commit> ( IPv4_ADDR | DOMAIN ) { [ host  => $item[3]           ] }
                    | IPv4_ADDR <commit> IPv4_ADDR              { [ range => $item[1], $item[3] ] }
                    | ANY       <commit>                        { [ any   =>                    ] }

   ports            : EQ    <commit> IDENT       { [ permit => $item[2], $item[2] ] }
                    | NEQ   <commit> IDENT       { [ deny   => $item[2], $item[2] ] }
                    | GT    <commit> IDENT       { [ deny   => 1,        $item[2] ] }
                    | LT    <commit> IDENT       { [ deny   => $item[2], 65535    ] }
                    | RANGE <commit> IDENT IDENT { [ permit => $item[2], $item[3] ] }
                    |                            { [ permit => 1,        65535    ] }


   # Rules that match simply return what they match (i.e. no type info is returned).

   PROTOCOL_NAME    : IDENT { $protocol_names{$item[1]} ? $item[1] : undef }

   DOMAIN           : ...

   IPv4_ADDR        : /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ { parse_ipv4_addr($item[1]) }

   # Keywords
   REMARK           : IDENT { $item[1] eq 'remark' ? $item[1] : undef }
   PERMIT           : IDENT { $item[1] eq 'permit' ? $item[1] : undef }
   DENY             : IDENT { $item[1] eq 'deny'   ? $item[1] : undef }
   ANY              : IDENT { $item[1] eq 'any'    ? $item[1] : undef }
   EQ               : IDENT { $item[1] eq 'eq'     ? $item[1] : undef }
   NEQ              : IDENT { $item[1] eq 'neq'    ? $item[1] : undef }
   LT               : IDENT { $item[1] eq 'lt'     ? $item[1] : undef }
   GT               : IDENT { $item[1] eq 'gt'     ? $item[1] : undef }

   IDENT            : /[a-zA-Z][a-zA-Z0-9_]*/

__EOI__

Parse::RecDescent->Precompile($grammar, 'Parser')
    or die("Bad grammar\n");

Run the above file, then you can use the parse as follows:

# test.pl

use strict;
use warnings;

use Data::Dumper qw( Dumper );
use Parser       qw( );

my $text = '...';

my $parser = Parser->new();

print(Dumper($parser->parse($text)));
0
On

Parse::RecDescent is slow. More to the point, recursive descent parsers tend to be slow. The best way to get a large improvement in speed would be to switch to a different parser, e.g. a LALR parser like Parse::Yapp.

If you want to try a less drastic approach to start with, see the guidelines for optimizing your grammars.

1
On

If you don't where your application is slow, sounds like you need to profile it. And Devel::NYTProf is the profiler of choice these days.