tokenizer and parser returns wrong answer for a postfix notation

Question

tokenizer and parser returns wrong answer for a postfix notation

54 Views Asked by Little At 27 July 2025 at 15:42

I have coded a tokenizer and a recursive parser for a postfix expression. My code is the following:

import re

token_patterns = [
    ('OPERATOR', r'[+\-*/]'),
    ('NUMBER', r'\d+'),
    ('WHITESPACE', r'\s+'),
]

def tokenize(source_code):
    tokens = []
    source_code = source_code.strip()

    while source_code:
        matched = False

        for token_type, pattern in token_patterns:
            match = re.match(pattern, source_code)
            if match:
                value = match.group(0)
                tokens.append((token_type, value))
                source_code = source_code[len(value):].lstrip()
                matched = True
                break

        if not matched:
            raise ValueError(f"Invalid character in source code: {source_code[0]}")

    return tokens

def parse_expression(tokens):
    if not tokens:
        return None

    token_type, value = tokens.pop(0)

    if token_type == 'NUMBER':
        return int(value)
    elif token_type == 'OPERATOR':
        if value in ('+', '-', '*', '/'):
            right = parse_expression(tokens)
            left = parse_expression(tokens)
            return (value, left, right)
        else:
            raise ValueError(f"Unexpected operator: {value}")
    else:
        raise ValueError(f"Unexpected token: {token_type}")

def evaluate_expression(expression):
    if isinstance(expression, int):
        return expression
    elif isinstance(expression, tuple):
        operator, left, right = expression
        if operator == '+':
            return evaluate_expression(left) + evaluate_expression(right)
        elif operator == '-':
            return evaluate_expression(left) - evaluate_expression(right)
        elif operator == '*':
            return evaluate_expression(left) * evaluate_expression(right)
        elif operator == '/':
            return evaluate_expression(left) / evaluate_expression(right)
    else:
        raise ValueError(f"Invalid expression: {expression}")

def main():
    source_code = "2 3 4 * +"
    tokens = tokenize(source_code)
    parsed_expression = parse_expression(tokens)
    
    print(f"Source code: {source_code}")
    print(f"Parsed expression: {parsed_expression}")
    
    result = evaluate_expression(parsed_expression)
    print(f"Result: {result}")

if __name__ == "__main__":
    main()

The part of the tokenize function is working correctly, giving me:

[('NUMBER', '2'), ('NUMBER', '3'), ('NUMBER', '4'), ('OPERATOR', '*'), ('OPERATOR', '+')]

but I would like to return in the

print(f"Parsed expression: {parsed_expression}")

something like this:

Parsed expression: ('+', 2, ('*', 3, 4))

however it only prints 2. Also, the result should be 14 and I also got 2. I am not able to find the mistake. Any help?

Original Q&A

There are 1 best solutions below

**Reilas** · Answer 1

For reference, here is the recursive descent parser from the book, "Python Cookbook", by O'Reilly Media.
GitHub – writing_a_simple_recursive_descent_parser.

# example.py
#
# An example of writing a simple recursive descent parser

import re
import collections

# Token specification
NUM    = r'(?P<NUM>\d+)'
PLUS   = r'(?P<PLUS>\+)'
MINUS  = r'(?P<MINUS>-)'
TIMES  = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS     = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES, 
                                  DIVIDE, LPAREN, RPAREN, WS]))

# Tokenizer
Token = collections.namedtuple('Token', ['type','value'])

def generate_tokens(text):
    scanner = master_pat.scanner(text)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok

# Parser 
class ExpressionEvaluator:
    '''
    Implementation of a recursive descent parser.   Each method
    implements a single grammar rule.  Use the ._accept() method
    to test and accept the current lookahead token.  Use the ._expect()
    method to exactly match and discard the next token on on the input
    (or raise a SyntaxError if it doesn't match).
    '''

    def parse(self,text):
        self.tokens = generate_tokens(text)
        self.tok = None             # Last symbol consumed
        self.nexttok = None         # Next symbol tokenized
        self._advance()             # Load first lookahead token
        return self.expr()

    def _advance(self):
        'Advance one token ahead'
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self,toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self,toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    # Grammar rules follow

    def expr(self):
        "expression ::= term { ('+'|'-') term }*"

        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval
    
    def term(self):
        "term ::= factor { ('*'|'/') factor }*"

        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval

    def factor(self):
        "factor ::= NUM | ( expr )"

        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

if __name__ == '__main__':
    e = ExpressionEvaluator()
    print(e.parse('2'))
    print(e.parse('2 + 3'))
    print(e.parse('2 + 3 * 4'))
    print(e.parse('2 + (3 + 4) * 5'))

# Example of building trees

class ExpressionTreeBuilder(ExpressionEvaluator):
    def expr(self):
        "expression ::= term { ('+'|'-') term }"

        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval = ('+', exprval, right)
            elif op == 'MINUS':
                exprval = ('-', exprval, right)
        return exprval
    
    def term(self):
        "term ::= factor { ('*'|'/') factor }"

        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval = ('*', termval, right)
            elif op == 'DIVIDE':
                termval = ('/', termval, right)
        return termval

    def factor(self):
        'factor ::= NUM | ( expr )'

        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

if __name__ == '__main__':
    e = ExpressionTreeBuilder()
    print(e.parse('2 + 3'))
    print(e.parse('2 + 3 * 4'))
    print(e.parse('2 + (3 + 4) * 5'))
    print(e.parse('2 + 3 + 4'))

tokenizer and parser returns wrong answer for a postfix notation

There are 1 best solutions below

Related Questions in PYTHON

Related Questions in PARSING

Related Questions in TOKENIZE

Related Questions in RECURSIVE-DESCENT

Trending Questions

Popular # Hahtags

Popular Questions