Quex: Defining UTF-8 Regular Expressions for Identifiers

I'm upgrading an ECMAScript engine from Quex 0.64.8 to Quex 0.67.5. I have the lexer up and running, but it now seems to detect only plain ASCII tokens, not UTF-8 tokens as before.

Essentially, what I do is supply the --codec utf8 flag when running Quex and use the following definitions to identify identifiers:

PATTERN_IDSTART  [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]

PATTERN_IDPART  {PATTERN_IDSTART}|{PATTERN_DIGIT}

PATTERN_ID      {PATTERN_IDSTART}{PATTERN_IDPART}*

The idea is that, rather than specifying all allowed characters, I define the characters that are not acceptable and exclude those. The new lexer detects identifiers such as "test1" or "safari" just fine, but seems to have problems with "日本語" and "Örjan". I am only using UTF-8 and use neither ICU nor iconv.

It feels like maybe I have misunderstood something here. Any help solving this problem would be much appreciated.

EDIT:

It might be useful to know I run Quex with the following arguments:

-i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
--analyzer-class ecmascript_lexer
--foreign-token-id-file ${BISON_ECMASCRIPT_PARSER_OUTPUT_HEADER}
--token-id-prefix TOK_ 
--template-compression 
--codec utf8            # --encoding utf8 since Quex 0.67.5
--buffer-element-size 1 
--buffer-element-type char
--odir ${CMAKE_CURRENT_BINARY_DIR}/generated
--language c++
--warning-on-outrun

EDIT 2:

I failed to reproduce the problem in a small example, since the UTF-8 parsing worked there. I have therefore created a standalone version of the lexer part of my ECMAScript engine instead, in the hope that it will be easier to see what is going wrong.

It is no longer clear to me whether my problem is actually related to parsing UTF-8 tokens; quite possibly something is wrong in my .qx file instead. Either way, here is the standalone version of my ECMAScript lexer.

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)

project(ecmascript CXX)

if(MSVC)
    add_definitions(-D_CRT_SECURE_NO_WARNINGS)
endif()

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)

set(QUEX_NAMES "quex")

if(CMAKE_HOST_WIN32)
    set(QUEX_NAMES "quex.bat" ${QUEX_NAMES})
else()
    set(QUEX_NAMES "quex-exe.py" ${QUEX_NAMES})
endif()

find_program(QUEX NAMES ${QUEX_NAMES} REQUIRED
             HINTS ENV QUEX_PATH DOC "Path to Quex's executable."
             NO_DEFAULT_PATH)

find_path(QUEX_INCLUDE_DIR quex/core.py REQUIRED
          HINTS ENV QUEX_PATH DOC "Path to Quex's include directory"
          NO_DEFAULT_PATH)

file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/generated)

add_definitions(-DQUEX_OPTION_LINE_NUMBER_COUNTING  
                -DQUEX_OPTION_ASSERTS_DISABLED)


add_definitions(-DQUEX_SETTING_BUFFER_SIZE=1024) # Set the lexer's buffer size

set(ECMASCRIPT_LEXER ${CMAKE_CURRENT_BINARY_DIR}/generated/ecmascript_lexer)

add_custom_command(OUTPUT ${ECMASCRIPT_LEXER} 
                          ${ECMASCRIPT_LEXER}.cpp
                          ${ECMASCRIPT_LEXER}-token 
                          ${ECMASCRIPT_LEXER}-configuration
                          ${ECMASCRIPT_LEXER}-token_ids
                          COMMAND ${QUEX} -i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
                                      --analyzer-class ecmascript_lexer # Name of the lexer class
                                      --foreign-token-id-file ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript_yacc.hpp # token IDs are generated by bison
                                      --token-id-prefix TOK_  # Custom prefix for tokens (see ecmascript.y for details)
                                      --template-compression  # Optimize mode transitions where possible
                                      --encoding utf8         # Base the lexer on the UTF-8 encoding
                                      --buffer-element-size 1 # Use a data type that is 1 byte in size
                                      --buffer-element-type uint8_t
                                      --odir ${CMAKE_CURRENT_BINARY_DIR}/generated  # Tell Quex where to put all generated files
                                      --language c++
                     DEPENDS ecmascript.qx VERBATIM
                     COMMENT "Generating ecmascript lexer..."
                     MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx) # This controls the order in which Quex and Bison are run

include_directories(${QUEX_INCLUDE_DIR} 
                    ${CMAKE_CURRENT_SOURCE_DIR}
                    ${CMAKE_CURRENT_BINARY_DIR}/generated)

set(es_lexer ${ECMASCRIPT_LEXER} 
             ${ECMASCRIPT_LEXER}.cpp
             _main.cpp)

set(es_generated ${es_lexer} ecmascript_yacc.hpp)

add_executable(es_lexer ${es_generated})               

ecmascript.qx

header {
    #include <quex/code_base/extra/accumulator/Accumulator>

    #include "ecmascript_yacc.hpp"
    #include <cstdlib>
    #include <cstdio>

    #define BACKSPACE           '\x08'
    #define TAB                 '\x09'
    #define NEWLINE             '\x0A'
    #define VERTICALTAB         '\x0B'
    #define FORMFEED            '\x0C'
    #define CARRIAGERETURN      '\x0D'
    #define DOUBLEQUOTE         '\x22'
    #define SINGLEQUOTE         '\x27'
    #define DOUBLEBACKSLASH     '\x5C'
    #define NULLTERM            '\x00'
}

footer {
    #include <quex/code_base/extra/accumulator/Accumulator.i>
}

define {
    PATTERN_NEWLINE [\n\r]

    PATTERN_DIGIT       [0-9]
    PATTERN_NOZDIGIT    [1-9]
    PATTERN_DECINTLIT   "0"|{PATTERN_NOZDIGIT}{PATTERN_DIGIT}*
    PATTERN_EXPIND      "e"|"E"
    PATTERN_SIGNEDINT   {PATTERN_DIGIT}+|"+"{PATTERN_DIGIT}+|"-"{PATTERN_DIGIT}+
    PATTERN_EXPPART     {PATTERN_EXPIND}{PATTERN_SIGNEDINT}

    PATTERN_DECNUMBER {PATTERN_DECINTLIT}"."{PATTERN_DIGIT}*{PATTERN_EXPPART}?|"."{PATTERN_DIGIT}+{PATTERN_EXPPART}?|{PATTERN_DECINTLIT}{PATTERN_EXPPART}?

    PATTERN_HEXDIGIT    [0-9a-fA-F]
    PATTERN_HEXNUMBER   "0x"{PATTERN_HEXDIGIT}+|"0X"{PATTERN_HEXDIGIT}+

    PATTERN_UNIESCSEQ   \\u{PATTERN_HEXDIGIT}{4}

    PATTERN_STRING      "\""(\\"\""|[^"])*"\""

    PATTERN_DOUBLE_QUOTE_STRING_DELIMITER "\""

    PATTERN_SINGLE_QUOTE_STRING_DELIMITER "'"

    PATTERN_SINGLELINE_COMMENT "//"[^\n\r]*

    PATTERN_IDSTART  [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]

    PATTERN_IDPART  {PATTERN_IDSTART}|{PATTERN_DIGIT}

    PATTERN_ID      {PATTERN_IDSTART}{PATTERN_IDPART}*      
}

mode EOF : <inheritable: only>  {    
    on_end_of_stream {
        self_send(TOK_LINETERM);  

        self_send(TOK_TERMINATION);
    }
}

mode RestrictedProduction : EOF
                            <skip: [ \t]>
{
    {PATTERN_NEWLINE}{
        self_send(';');
        self << Program;
    }

    on_failure {
        self.undo();
        self << Program;
    }
}

mode StringHelper : EOF
                    <inheritable: only>
{
    on_entry {
        self_send(TOK_QUOTE);
    }

    on_exit { 
        if(self.accumulator.text.begin != self.accumulator.text.end)
            self_send(TOK_STRLITPART);

        self_accumulator_flush(TOK_QUOTE);
    }

    {PATTERN_NEWLINE} => '\n';

    "\\b" { self_accumulator_add_character(BACKSPACE); }

    "\\t" { self_accumulator_add_character(TAB); }



    "\\n" { self_accumulator_add_character(NEWLINE); }

    "\\v" { self_accumulator_add_character(VERTICALTAB); }

    "\\f" { self_accumulator_add_character(FORMFEED); }

    "\\r" { self_accumulator_add_character(CARRIAGERETURN); }

    "\\\"" { self_accumulator_add_character(DOUBLEQUOTE); }

    "\\'" { self_accumulator_add_character(SINGLEQUOTE); }

    "\\\\" { self_accumulator_add_character(DOUBLEBACKSLASH); } 

    "\\0" { self_accumulator_add_character(NULLTERM); }

    "\\x"{PATTERN_HEXDIGIT}{2}
    {
        {
            unsigned long ulResult = strtoul(reinterpret_cast<char*>(Lexeme+2),0,16);
            uint8_t *const pBuffer = reinterpret_cast<uint8_t*>(&ulResult);
            self_accumulator_add(pBuffer,pBuffer+2);            
        }
    }

    on_failure {
        self_accumulator_add(Lexeme, LexemeEnd); 
    }
}

mode SingleQuoteString : StringHelper
{
    {PATTERN_SINGLE_QUOTE_STRING_DELIMITER}
    { 
        // If we found the end of the string, switch back to the Program mode
        self << Program; 
    }
}

mode DoubleQuoteString : StringHelper
{
    {PATTERN_DOUBLE_QUOTE_STRING_DELIMITER}
    { 
        // If we found the end of the string, switch back to the Program mode
        self << Program; 
    }
}

mode PrefixHelper : EOF
                    <skip: [ \t]>   // Ignore whitespace
{
    on_entry {
        self.seek_backward(3);
    }

    {PATTERN_NEWLINE} 
    {
        if(self.iParaCount == 0)
            self_send(';'); 
    }

    "++"
    {
        self_send(TOK_PLUSPLUS);
        self << Program;
    }

    "--"
    {   
        self_send(TOK_MINUSMINUS);
        self << Program;
    }

    on_failure {
        (void)Lexeme;
    }
}

mode Operators : <inheritable: only>
{
    "||"        => TOK_OR;
    "&&"        => TOK_AND; 
    "++"        { self << PrefixHelper; }
    "--"        { self << PrefixHelper; } 
    "==="       => TOK_EQEQEQ; 
    "=="        => TOK_EQEQ; 
    "!=="       => TOK_NEQEQ; 
    "!="        => TOK_NEQ;  
    "*="        => TOK_MULTEQ; 
    "/="        => TOK_DIVEQ;
    "%="        => TOK_MODEQ; 
    "+="        => TOK_PLUSEQ; 
    "\-="       => TOK_MINUSEQ;
    ">>>="      => TOK_GTGTGTEQ;
    ">>>"       => TOK_GTGTGT;   
    "<<="       => TOK_LTLTEQ; 
    ">>="       => TOK_GTGTEQ;
    "<<"        => TOK_LTLT; 
    ">>"        => TOK_GTGT;
    "<="        => TOK_LTE; 
    ">="        => TOK_GTE;  
    "&="        => TOK_AMPEQ; 
    "^="        => TOK_CIRCEQ; 
    "|="        => TOK_PIPEEQ; 

    ['=']       => '='; 
    ['!']       => '!'; 
    ['(']       { self_send('('); ++self.iParaCount; }
    ['+']       => '+';
    ['\-']      => '-';
    ['*']       => '*';
    ['/']       => '/';
    ['%']       => '%';
    ['<']       => '<';
    ['>']       => '>'; 
    ['\[']      => '[';
    ['\]']      => ']';
    ['.']       => '.';
    [',']       => ',';
    ['?']       => '?';
    [':']       => ':';
    ['~']       => '~';
    ['&']       => '&';
    ['^']       => '^';
    ['|']       => '|';
    ['{']       => '{';
    [';']       => ';';
    [')']       { self_send(')'); --self.iParaCount; }
    ['}']       { self_send(TOK_LINETERM); self_send('}'); }
}

mode Keywords : <inheritable: only>
{
    function        => TOK_FUNCTION; 
    return          { self_send(TOK_RETURN); self << RestrictedProduction; } 
    var             => TOK_VAR; 
    null            => TOK_NULL; 
    true            => TOK_TRUE; 
    false           => TOK_FALSE; 
    instanceof      => TOK_INSTANCEOF; 
    in              => TOK_IN; 
    delete          => TOK_DELETE; 
    void            => TOK_VOID; 
    typeof          => TOK_TYPEOF; 
    this            => TOK_THIS; 
    if              => TOK_IF; 
    else            => TOK_ELSE; 
    with            => TOK_WITH; 
    throw           { self_send(TOK_THROW); self << RestrictedProduction; } 
    try             => TOK_TRY; 
    catch           => TOK_CATCH; 
    finally         => TOK_FINALLY; 
    for             => TOK_FOR; 
    break           { self_send(TOK_BREAK); self << RestrictedProduction; } 
    continue        { self_send(TOK_CONTINUE); self << RestrictedProduction; } 
    while           => TOK_WHILE; 
    do              => TOK_DO; 
    switch          => TOK_SWITCH; 
    case            => TOK_CASE; 
    default         => TOK_DEFAULT; 
    new             => TOK_NEW; 
    synchronized    => TOK_SYNCHRONIZED; 
}

mode Values :  <inheritable: only>
{
    {PATTERN_DECNUMBER} => TOK_DECLIT(Lexeme);

    {PATTERN_HEXNUMBER} => TOK_HEXINTLIT(Lexeme);

    {PATTERN_DOUBLE_QUOTE_STRING_DELIMITER} { self << DoubleQuoteString; }

    {PATTERN_SINGLE_QUOTE_STRING_DELIMITER} { self << SingleQuoteString; }  
}

mode Identifiers : <inheritable: only>
{   
    {PATTERN_ID} => TOK_ID(Lexeme);
}

mode Program : Keywords,
               Identifiers,
               Values,
               Operators,
               EOF              
               <skip: [ \t]>
               <skip_range: "/*" "*/">
{
    {PATTERN_NEWLINE}
    {
        if(self.iParaCount == 0)
            self_send(TOK_LINETERM);
    }

    {PATTERN_SINGLELINE_COMMENT}
    {} 
}

body {

    void push_token(const unsigned int uiToken)
    {
        self.uiLastToken = self.uiCurrentToken;
        self.uiCurrentToken = uiToken;
    }

    bool use_auto_semi() const
    { return uiLastToken == TOK_LINETERM; }

    unsigned int uiLastToken,
                 uiCurrentToken;

    int iParaCount;

    quex::Token* pLastID;

    QUEX_NAME(Accumulator) accumulator;
}

constructor {
    self.uiLastToken = 0;
    self.uiCurrentToken = 0;
    self.iParaCount = 0;
    self.pLastID = 0;

    if(!QUEX_NAME(Accumulator_construct)(&me->accumulator, me)) {
        return false;
    }   
}

destructor {
    QUEX_NAME(Accumulator_destruct)(&me->accumulator);
}

start = Program;

ecmascript_yacc.hpp

#ifndef YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED
# define YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED

/* Token type.  */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
  enum yytokentype
  {
    TOK_TERMINATION = 0,
    TOK_UNINITIALIZED = 1,
    TOK_ID = 258,
    TOK_NULL = 259,
    TOK_TRUE = 260,
    TOK_FALSE = 261,
    TOK_DECLIT = 262,
    TOK_HEXINTLIT = 263,
    TOK_OR = 264,
    TOK_AND = 265,
    TOK_PLUSPLUS = 266,
    TOK_MINUSMINUS = 267,
    TOK_EQEQ = 268,
    TOK_NEQ = 269,
    TOK_EQEQEQ = 270,
    TOK_NEQEQ = 271,
    TOK_LTE = 272,
    TOK_GTE = 273,
    TOK_INSTANCEOF = 274,
    TOK_IN = 275,
    TOK_STRLITPART = 276,
    TOK_QUOTE = 277,
    TOK_VOID = 278,
    TOK_TYPEOF = 279,
    TOK_DELETE = 280,
    TOK_THIS = 281,
    TOK_LTLT = 282,
    TOK_GTGT = 283,
    TOK_GTGTGT = 284,
    TOK_MULTEQ = 285,
    TOK_DIVEQ = 286,
    TOK_MODEQ = 287,
    TOK_PLUSEQ = 288,
    TOK_MINUSEQ = 289,
    TOK_LTLTEQ = 290,
    TOK_GTGTEQ = 291,
    TOK_GTGTGTEQ = 292,
    TOK_AMPEQ = 293,
    TOK_CIRCEQ = 294,
    TOK_PIPEEQ = 295,
    TOK_IF = 296,
    TOK_ELSE = 297,
    TOK_RETURN = 298,
    TOK_VAR = 299,
    TOK_WITH = 300,
    TOK_THROW = 301,
    TOK_TRY = 302,
    TOK_CATCH = 303,
    TOK_FINALLY = 304,
    TOK_FOR = 305,
    TOK_BREAK = 306,
    TOK_CONTINUE = 307,
    TOK_WHILE = 308,
    TOK_DO = 309,
    TOK_SWITCH = 310,
    TOK_CASE = 311,
    TOK_DEFAULT = 312,
    TOK_NEW = 313,
    TOK_FUNCTION = 314,
    TOK_SYNCHRONIZED = 315,
    TOK_LINETERM = 316
  };
#endif

/* Value type.  */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif

#endif /* !YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED  */

_main.cpp

#include <iostream>

#include "ecmascript_lexer"

/****************************************************************************************/
void print_token(quex::Token* token)
{
    std::cout << token->get_string() << std::endl;
}

/****************************************************************************************/
int main(int argc, char** argv)
{
    quex::Token*     token = 0;
    quex::ecmascript_lexer  qlex;

    quex::ecmascript_lexer *lexer = quex::ecmascript_lexer::from_file_name("id_test.js", 0);

    while(lexer->error_code == E_Error_None) 
    {
        get_token:

        lexer->receive(&token);

        if(!token)
            break;

        print_token(token);

        lexer->push_token(token->type_id());

        if(token->type_id() == TOK_LINETERM)
            goto get_token;

        if(token->type_id() == TOK_ID)
            lexer->pLastID = token;

        if(token->type_id() == TOK_TERMINATION)
            break;
    }

    delete lexer;
    return 0;
}

id_test.js // Used for testing the lexer

test1 = safari;
myFunc()

function t(){}

if(test1 < 23)
    return myFunc(45);

myFunc();

svenskaåäö();

var kalleö = 34;

var _us=kalleö;

_us = 678

日本語 = "Nihongo" // Japanska språkets namn 

$myCar = _us

var new1 = kalleö ? t(); 

"kalleÖ, _us and $myCar should be ignored here"

 الفصحى = "Arabiska"

/*
    var new1 = kalleÖ ? t(); 

    "kalleÖ, _us and $myCar should be ignored here"
*/

// var new1 = kalleÖ ? t(); 
대한민국 = 45; 
대한민국X45 = "Value of: 대한민국" + 대한민국; 

ärta="ärta + 2"

mix帝With대한민국 = "success?"; 

Örjan;

önes;
cake;

Россия; 
РоссияX;
РоссияX
XРоссия;
XРоссия;

始皇帝 = "The First emperor"
始皇帝x2 = "The First emperor, twice?"

Best regards,

Patrik J

1 Answer

I would suggest you rely on Unicode Properties, in particular ID_Start and ID_Continue, such that your .qx file contains

define {
    ID_START    \P{ID_Start} 
    ID_CONTINUE \P{ID_Continue}
    ID          {ID_START}{ID_CONTINUE}*
}

Quex then samples the UCS database and you won't have to worry about the particular code points.
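
As a side note, Quex also has a query mode that can print the character set behind such an expression (the flag name may differ slightly between versions, so treat this as a sketch rather than an exact incantation); something along these lines should show which code points the generated lexer will accept as identifier starts:

> quex --set-by-expression '\P{ID_Start}'

The output lists the covered code point intervals, so you can verify at a glance that, say, U+00D6 ('Ö') and the CJK ranges are included.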

Also, if you only want to support a subset, use intersection to cut out the desired UCS range, as in the following example:

...
    ID_START    [: intersection([\X900-\X970], \P{ID_Start}) :]
    ID_CONTINUE [: intersection([\X900-\X970], \P{ID_Continue}) :]
...

PS, your solution is not totally wrong. Given a file example.qx:

define {
    PATTERN_IDSTART  [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
    PATTERN_IDPART  {PATTERN_IDSTART}|[0-9]
    PATTERN_ID      {PATTERN_IDSTART}{PATTERN_IDPART}*
}

token { ID; WS; }

mode X { 
    {PATTERN_ID} => QUEX_TKN_ID(Lexeme); 
    [ \n\t]      => QUEX_TKN_WS(Lexeme); 
}

And some user file 'example.c':

#include <stdio.h>

#include "EasyLexer.h"

void
print_token(quex_Token* token_p)
{   
    const size_t    BufferSize = 1024;
    char            buffer[1024];
    printf("%s \n", QUEX_NAME_TOKEN(get_string)(token_p, buffer, BufferSize));
}

int
main(int argc, char** argv)
{   
    quex_Token*     token_p = NULL;
    quex_EasyLexer  qlex;

    quex_EasyLexer_from_file_name(&qlex, "example.txt", NULL);

    while( qlex.error_code == E_Error_None ) {
        quex_EasyLexer_receive(&qlex, &token_p);
        if( ! token_p ) break;

        print_token(token_p);
        if( token_p->_id == QUEX_TKN_TERMINATION ) break;
    }

    quex_EasyLexer_destruct(&qlex);
    return 0;
}

then executing on the command line:

> quex -i example.qx --encoding utf8 --language C -o EasyLexer
> gcc -I$QUEX_PATH example.c EasyLexer.c -o example
> ./example example.txt

delivers

ID 'Örjan' 
WS '\n' 
ID '日本語' 
WS '\n'

given that the file 'example.txt' is UTF-8 encoded and contains

Örjan
日本語

I do not know what else to say. Am I misunderstanding something?