SimpleParse not showing the result tree

128 Views Asked by At

I am working with Google Protocol Buffers (protobuf), trying to parse a proto file using SimpleParse in Python.

I am using the EBNF format with SimpleParse. The parser reports success, but the result tree is empty, and I am not sure what is going wrong. Any help would really be appreciated.

Following is the grammar file:

  # SimpleParse EBNF grammar for proto2-style Protocol Buffer definitions.
  # NOTE(review): no whitespace productions (e.g. a 'ts' rule) appear anywhere,
  # and SimpleParse does not skip whitespace between ',' sequence items on its
  # own — so input containing spaces fails to match. Because everything under
  # the '*' in 'proto' is optional, 'proto' then "succeeds" after consuming
  # zero characters, which is presumably why the result tree comes back empty
  # — TODO confirm against the SimpleParse documentation.
  proto ::= ( message / extend / enum / import / package / option / ';' )*

    import ::= 'import' , strLit , ';'

    package ::= 'package' , ident , ( '.' , ident )* , ';'

    option ::= 'option' , optionBody , ';'

    optionBody ::= ident , ( '.' , ident )* , '=' , constant

    message ::= 'message' , ident , messageBody

    extend ::= 'extend' , userType , '{' , ( field / group / ';' )* , '}'

    enum ::= 'enum' , ident , '{' , ( option / enumField / ';' )* , '}'

    enumField ::= ident , '=' , intLit , ';'

    # NOTE(review): 'service' is never referenced from 'proto' (or any other
    # production), so services in the input are unreachable by this grammar.
    service ::= 'service' , ident , '{' , ( option / rpc / ';' )* , '}'

    rpc ::= 'rpc' , ident ,  '(' , userType , ')' , 'returns' , '(' , userType , ')' , ';'

    # NOTE(review): the ':' alternative here looks like a typo for ';' — every
    # comparable body rule ('extend', 'enum', 'service') uses ';'.
    messageBody ::= '{' , ( field / enum / message / extend / extensions / group / option / ':' )* , '}'

    group ::= label , 'group' , camelIdent , '=' , intLit , messageBody

    field ::= label , type , ident , '=' , intLit , ( '[' , fieldOption , ( ',' , fieldOption )* , ']' )? , ';'

    # NOTE(review): verify precedence — if sequence binds tighter than '/',
    # this parses as optionBody / ('default' , '=' , constant), which is the
    # intended reading; confirm with the SimpleParse grammar docs.
    fieldOption ::= optionBody / 'default' , '=' , constant

    extensions ::= 'extensions' , extension , ( ',' , extension )* , ';'

    extension ::= intLit , ( 'to' , ( intLit / 'max' ) )?

    label ::= 'required' / 'optional' / 'repeated'

    type ::= 'double' / 'float' / 'int32' / 'int64' / 'uint32' / 'uint64' / 'sint32' / 'sint64' / 'fixed32' / 'fixed64' / 'sfixed32' / 'sfixed64' / 'bool' / 'string' / 'bytes' / userType

    userType ::= '.'? , ident , ( '.' , ident )*

    constant ::= ident / intLit / floatLit / strLit / boolLit

    ident ::= [A-Za-z_],[A-Za-z0-9_]*

    camelIdent ::= [A-Z],[\w_]*

    # NOTE(review): a bare '0' matches none of these alternatives — decInt
    # starts with [1-9], hexInt needs 0x/0X plus hex digits, and octInt
    # requires at least one digit after the leading 0. A field number of 0
    # (or any literal '0') is therefore unparseable.
    intLit ::= decInt / hexInt / octInt

    decInt ::= [1-9],[\d]*

    hexInt ::= [0],[xX],[A-Fa-f0-9]+

    octInt ::= [0],[0-7]+

    # NOTE(review): [\.\d+]? is one optional character drawn from the class
    # {'.', digit, '+'} — not an optional fractional part like ('.' digits).
    floatLit ::= [\d]+ , [\.\d+]? 

    boolLit ::= 'true' / 'false'

    strLit ::= quote ,( hexEscape / octEscape / charEscape / [^\0\n] )* , quote

    # Only a single-quote is in this character class, so double-quoted
    # literals (the usual .proto style) will not match strLit.
    quote ::= ['']

    hexEscape ::= [\\],[Xx],[A-Fa-f0-9]
    octEscape ::= [\\0]? ,[0-7]
    charEscape ::= [\\],[abfnrtv\\\?'']

And this is the python code that I am experimenting with:

from simpleparse.parser import Parser
from pprint import pprint


# Name of the grammar's root production; parsing starts here.
protoGrammarRoot = "proto"

# Load the EBNF grammar declaration from disk.
with open("proto_grammar.ebnf", "r") as grammarFile:
    protoGrammar = grammarFile.read()

# Load the .proto source. Newlines are stripped before parsing; note the
# grammar itself defines no whitespace handling, so remaining spaces may
# still prevent a match.
with open("sample.proto", "r") as protoFile:
    protoInput = protoFile.read().replace('\n', '')

parser = Parser(protoGrammar, protoGrammarRoot)

# parse() returns (success flag, result tree, number of characters consumed).
# NOTE: success can be 1 with an empty tree when the root production matches
# zero characters — compare newCharacter against len(protoInput) to detect
# a vacuous match.
success, resultTree, newCharacter = parser.parse(protoInput)

pprint(protoInput)

pprint(success)

pprint(resultTree)

pprint(newCharacter)

and this the proto file that I am trying to parse

message AmbiguousMsg {
  optional string mypack_ambiguous_msg = 1;
  optional string mypack_ambiguous_msg1 = 1;
}

I get the output as

1
[]
0
1

There is 1 answer below.

0
On

I am new to Python but I came up with this, although I am not entirely sure of your output format. Hopefully this will point you in the right direction. Feel free to modify the code below to cater your requirements.

#!/usr/bin/python
# (c) 2015 enthusiasticgeek for StackOverflow. Use the code in anyway you want but leave credits intact. Also use this code at your own risk. I do not take any responsibility for your usage - blame games and trolls will strictly *NOT* be tolerated. 
import re
#data_types=['string','bool','enum','int32','uint32','int64','uint64','sint32','sint64','bytes','string','fixed32','sfixed32','float','fixed64','sfixed64','double']

#function # 1
#Generate list of units in the brackets
#================ tokens based on braces ====================
def find_balanced_braces(args):
    """Return the text inside each top-level {...} group found in *args*.

    Args:
        args: iterable of strings to scan.

    Returns:
        List of the contents of every depth-1 brace group, stripped of
        surrounding whitespace; nested braces are kept verbatim inside
        their enclosing group. Strings without '{' contribute nothing.
    """
    parts = []
    for arg in args:
        # Original code had this body dedented to the 'for' level, which is
        # an IndentationError; the per-arg state must reset inside the loop.
        if '{' not in arg:
            continue
        chars = []
        depth = 0
        for c in arg:
            if c == '{':
                # Keep inner braces, drop the outermost pair.
                if depth > 0:
                    chars.append(c)
                depth += 1
            elif c == '}':
                depth -= 1
                if depth > 0:
                    chars.append(c)
                elif depth == 0:
                    # Closed a top-level group: emit it and start fresh.
                    parts.append(''.join(chars).strip())
                    chars = []
            elif depth > 0:
                chars.append(c)
    return parts

#function # 2
#================ Retrieve Nested Levels ====================
def find_nested_levels(test, count_level):
    """Return the maximum brace-nesting depth reachable from *test*.

    Args:
        test: list of strings (typically the output of find_balanced_braces).
        count_level: depth counted so far; callers start at 0.

    Returns:
        count_level plus the number of additional levels that still contain
        balanced brace groups.
    """
    inner = find_balanced_braces(test)
    if not inner:
        return count_level
    return find_nested_levels(inner, count_level + 1)

#function # 3
#================ Process Nested Levels ====================
def process_nested_levels(test, count_level):
    """Print the field declarations found at each brace-nesting level.

    Recursively peels one level of braces per call, printing the part of
    each group that precedes its first nested 'message' keyword, split on
    ';' into individual declarations.

    Args:
        test: list of strings to scan (callers pass [data]).
        count_level: depth counted so far; callers start at 0.

    Returns:
        Total nesting depth, same as find_nested_levels.
    """
    count_level = count_level + 1
    level = find_balanced_braces(test)
    # Single-argument parenthesized print works in both Python 2 and 3;
    # the original bare 'print' statements were Python-2-only.
    print("===== Level = " + str(count_level) + " =====")
    for i in range(len(level)):
        # Text of this group up to its first nested 'message' keyword.
        exclusive_level_messages = level[i].split("message")[0]
        exclusive_level_messages_tokenized = exclusive_level_messages.split(";")
        # Original code had this loop body dedented to the 'for j' level,
        # which is an IndentationError.
        for j in range(len(exclusive_level_messages_tokenized)):
            pattern = exclusive_level_messages_tokenized[j].lstrip()
            print(pattern)
            # Example of tokenizing further with a regex:
            # match = "\message \s*(.*?)\s*\{" + pattern
            # match_result = re.findall(match, level[i])
            # print(match_result)
    print("===== End Level =====")
    if not level:
        return count_level - 1
    else:
        return process_nested_levels(level, count_level)
#============================================================
#=================================================================================
# Built-in nested-message sample; substitute for 'data' below to test
# without a fileproto.proto on disk.
test_string=("message a{ optional string level-i1.l1.1 = 1 [default = \"/\"]; "
"message b{ required bool level-i1.l2.1 = 1; required fixed32 level-i1.l2.1 = 2; "
"message c{ required string level-i1.l3.1 = 1; } "
"} "
"} "
"message d{ required uint64 level-i2.l1.1 = 1; required double level-i2.l1.2 = 2; "
"message e{ optional double level-i2.l2.1 = 1; "
"message f{ optional fixed64 level-i2.l3.1 = 1; required fixed32 level-i2.l3.2 = 2; "
"message g{ required bool level-i2.l4.1 = 2; } "
"} "
"} "
"} "
"message h{ required uint64 level-i3.l1.1 = 1; required double level-i3.l1.2 = 2; }")

# Newlines are harmless to the brace scanner, so the file is read as-is
# (the original .replace('\n', '\n') was a no-op).
with open("fileproto.proto", "r") as myfile:
    data = myfile.read()
print(data)

count_level = 0
# Replace 'data' in the following line with 'test_string' for tests.
nested_levels = process_nested_levels([data], count_level)
print("Total count levels depth = " + str(nested_levels))
print("========================\n")

My output looks as follows

// This defines protocol for a simple server that lists files.
//
// See also the nanopb-specific options in fileproto.options.

message ListFilesRequest {
    optional string path = 1 [default = "/"];
}

message FileInfo {
    required uint64 inode = 1;
    required string name = 2;
}

message ListFilesResponse {
    optional bool path_error = 1 [default = false];
    repeated FileInfo file = 2;
}


===== Level = 1 =====
optional string path = 1 [default = "/"]

required uint64 inode = 1
required string name = 2

optional bool path_error = 1 [default = false]
repeated FileInfo file = 2

===== End Level =====
===== Level = 2 =====
===== End Level =====
Total count levels depth = 1
========================

NOTE: After `print(pattern)` you can tokenize further if necessary by taking `pattern` as an input. I have left a commented-out example using a regex.