I'm upgrading an ECMAScript engine that previously used
Quex 0.64.8
to Quex 0.67.5.
I have the lexer up and
running, but it seems it can only detect ANSI tokens now,
and no longer UTF-8 tokens as before.
Essentially, what I do is to supply the --codec utf8
flag while
running Quex and use the following code to identify identifiers:
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
PATTERN_IDPART {PATTERN_IDSTART}|{PATTERN_DIGIT}
PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
The idea is that, rather than specifying all allowed characters, I define which ones are not acceptable and exclude those. The new lexer detects identifiers such as "test1" or "safari" just fine, but seems to have problems with "日本語" and "Örjan". I am also only using UTF-8 and don't use either ICU or Iconv.
It feels like maybe I have misunderstood something here. Any help solving this problem would be much appreciated.
EDIT:
It might be useful to know I run Quex with the following arguments:
-i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
--analyzer-class ecmascript_lexer
--foreign-token-id-file ${BISON_ECMASCRIPT_PARSER_OUTPUT_HEADER}
--token-id-prefix TOK_
--template-compression
--codec utf8 //--encoding utf8 since Quex 0.67.5
--buffer-element-size 1
--buffer-element-type char
--odir ${CMAKE_CURRENT_BINARY_DIR}/generated
--language c++
--warning-on-outrun
EDIT 2:
I failed to re-create a small example since the utf-8 parsing worked in the example. Therefore I have created a standalone version of the lexer part of my ecmascript engine instead, with the hope that it would be easier to see what's going wrong.
It is no longer clear to me if my problem is actually related to parsing utf8 tokens. Quite possibly something may be wrong in my .qx file instead... either way, here is the standalone version of my ecmascript lexer.
CMakeLists.txt
# Build script for the standalone ECMAScript lexer (Quex-generated C++).
cmake_minimum_required(VERSION 2.8)
project(ecmascript CXX)
if(MSVC)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
# Candidate names for the Quex executable, platform dependent.
set(QUEX_NAMES "quex")
if(CMAKE_HOST_WIN32)
set(QUEX_NAMES "quex.bat" ${QUEX_NAMES})
else()
set(QUEX_NAMES "quex-exe.py" ${QUEX_NAMES})
endif()
# NOTE(review): the REQUIRED keyword on find_program/find_path needs
# CMake >= 3.18, but cmake_minimum_required above declares 2.8 —
# confirm which CMake version is actually targeted.
find_program(QUEX NAMES ${QUEX_NAMES} REQUIRED
HINTS ENV QUEX_PATH DOC "Path to Quex's executable."
NO_DEFAULT_PATH)
find_path(QUEX_INCLUDE_DIR quex/core.py REQUIRED
HINTS ENV QUEX_PATH DOC "Path to Quex's include directory"
NO_DEFAULT_PATH)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/generated)
add_definitions(-DQUEX_OPTION_LINE_NUMBER_COUNTING
-DQUEX_OPTION_ASSERTS_DISABLED)
add_definitions(-DQUEX_SETTING_BUFFER_SIZE=1024) # Sets the lexer's buffer size
set(ECMASCRIPT_LEXER ${CMAKE_CURRENT_BINARY_DIR}/generated/ecmascript_lexer)
add_custom_command(OUTPUT ${ECMASCRIPT_LEXER}
${ECMASCRIPT_LEXER}.cpp
${ECMASCRIPT_LEXER}-token
${ECMASCRIPT_LEXER}-configuration
${ECMASCRIPT_LEXER}-token_ids
COMMAND ${QUEX} -i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
--analyzer-class ecmascript_lexer # Name of the generated lexer class
--foreign-token-id-file ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript_yacc.hpp # token ids are generated by bison
--token-id-prefix TOK_ # Custom prefix for tokens (see ecmascript.y for details)
--template-compression # Optimize mode transitions where possible
--encoding utf8 # Base the lexer on the UTF-8 encoding
--buffer-element-size 1 # Use a data type that is 1 byte in size
--buffer-element-type uint8_t
--odir ${CMAKE_CURRENT_BINARY_DIR}/generated # Tell Quex where all generated files should go
--language c++
DEPENDS ecmascript.qx VERBATIM
COMMENT "Generating ecmascript lexer..."
MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx) # This controls the order in which Quex and Bison run
include_directories(${QUEX_INCLUDE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}/generated)
set(es_lexer ${ECMASCRIPT_LEXER}
${ECMASCRIPT_LEXER}.cpp
_main.cpp)
set(es_generated ${es_lexer} ecmascript_yacc.hpp)
add_executable(es_lexer ${es_generated})
ecmascript.qx
header {
// C++ code copied verbatim into the top of the generated lexer source.
#include <quex/code_base/extra/accumulator/Accumulator>
#include "ecmascript_yacc.hpp"
#include <cstdlib>
#include <cstdio>
// ASCII codes for the escape sequences handled in the string modes below.
#define BACKSPACE '\x08'
#define TAB '\x09'
#define NEWLINE '\x0A'
#define VERTICALTAB '\x0B'
#define FORMFEED '\x0C'
#define CARRIAGERETURN '\x0D'
#define DOUBLEQUOTE '\x22'
#define SINGLEQUOTE '\x27'
#define DOUBLEBACKSLASH '\x5C'
#define NULLTERM '\x00'
}
footer {
// The accumulator implementation must be included exactly once.
#include <quex/code_base/extra/accumulator/Accumulator.i>
}
define {
// Named sub-patterns; referenced below as {NAME}.
PATTERN_NEWLINE [\n\r]
PATTERN_DIGIT [0-9]
// Non-zero digit: first digit of a multi-digit decimal integer literal.
PATTERN_NOZDIGIT [1-9]
PATTERN_DECINTLIT "0"|{PATTERN_NOZDIGIT}{PATTERN_DIGIT}*
PATTERN_EXPIND "e"|"E"
PATTERN_SIGNEDINT {PATTERN_DIGIT}+|"+"{PATTERN_DIGIT}+|"-"{PATTERN_DIGIT}+
PATTERN_EXPPART {PATTERN_EXPIND}{PATTERN_SIGNEDINT}
// Decimal number: "12.34e5", ".5", "12e-3", "12", ...
PATTERN_DECNUMBER {PATTERN_DECINTLIT}"."{PATTERN_DIGIT}*{PATTERN_EXPPART}?|"."{PATTERN_DIGIT}+{PATTERN_EXPPART}?|{PATTERN_DECINTLIT}{PATTERN_EXPPART}?
PATTERN_HEXDIGIT [0-9a-fA-F]
PATTERN_HEXNUMBER "0x"{PATTERN_HEXDIGIT}+|"0X"{PATTERN_HEXDIGIT}+
PATTERN_UNIESCSEQ \\u{PATTERN_HEXDIGIT}{4}
PATTERN_STRING "\""(\\"\""|[^"])*"\""
PATTERN_DOUBLE_QUOTE_STRING_DELIMITER "\""
PATTERN_SINGLE_QUOTE_STRING_DELIMITER "'"
PATTERN_SINGLELINE_COMMENT "//"[^\n\r]*
// Identifier start: everything EXCEPT digits, operator characters and
// whitespace.  With --encoding utf8 this negated class is intended to
// admit all remaining Unicode code points (e.g. '日', 'Ö') as starters.
// NOTE(review): consider the Unicode properties \P{ID_Start} /
// \P{ID_Continue} instead of a hand-written exclusion list — TODO
// confirm against the Quex documentation for the version in use.
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
PATTERN_IDPART {PATTERN_IDSTART}|{PATTERN_DIGIT}
PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
}
// Shared base mode: on end of input, emit a final line terminator followed
// by the termination token so the parser can finish cleanly.
mode EOF : <inheritable: only> {
on_end_of_stream {
self_send(TOK_LINETERM);
self_send(TOK_TERMINATION);
}
}
// Entered after restricted-production keywords (return/throw/break/continue):
// a newline directly after such a keyword forces an automatic semicolon.
mode RestrictedProduction : EOF
<skip: [ \t]>
{
{PATTERN_NEWLINE}{
self_send(';');
self << Program;
}
on_failure {
// Not a newline: put the character back and let Program handle it.
self.undo();
self << Program;
}
}
// Shared base for the two string modes: accumulates string contents while
// translating backslash escape sequences, and brackets the accumulated text
// with TOK_QUOTE tokens (the text itself is sent as TOK_STRLITPART).
mode StringHelper : EOF
<inheritable: only>
{
on_entry {
self_send(TOK_QUOTE);
}
on_exit {
// Flush any accumulated text before the closing quote token.
if(self.accumulator.text.begin != self.accumulator.text.end)
self_send(TOK_STRLITPART);
self_accumulator_flush(TOK_QUOTE);
}
{PATTERN_NEWLINE} => '\n';
"\\b" { self_accumulator_add_character(BACKSPACE); }
"\\t" { self_accumulator_add_character(TAB); }
"\\n" { self_accumulator_add_character(NEWLINE); }
"\\v" { self_accumulator_add_character(VERTICALTAB); }
"\\f" { self_accumulator_add_character(FORMFEED); }
"\\r" { self_accumulator_add_character(CARRIAGERETURN); }
"\\\"" { self_accumulator_add_character(DOUBLEQUOTE); }
"\\'" { self_accumulator_add_character(SINGLEQUOTE); }
"\\\\" { self_accumulator_add_character(DOUBLEBACKSLASH); }
"\\0" { self_accumulator_add_character(NULLTERM); }
"\\x"{PATTERN_HEXDIGIT}{2}
{
{
// Decode the two hex digits after "\x" into a value in 0..255.
const unsigned long ulResult = strtoul(reinterpret_cast<char*>(Lexeme+2),0,16);
// The previous code reinterpreted the raw bytes of 'ulResult'
// (reinterpret_cast<uint8_t*>(&ulResult), first two bytes), which is
// endianness-dependent and only correct on little-endian targets.
// Extract the low-order bytes explicitly instead.
// NOTE(review): two bytes are appended (value byte + high byte, which
// is always zero for a 2-digit escape) to preserve the original
// little-endian behavior — confirm that a single byte was not intended.
uint8_t aBytes[2];
aBytes[0] = static_cast<uint8_t>(ulResult & 0xFFu);
aBytes[1] = static_cast<uint8_t>((ulResult >> 8) & 0xFFu);
self_accumulator_add(aBytes, aBytes + 2);
}
}
on_failure {
// Any other character is part of the string: accumulate it verbatim.
self_accumulator_add(Lexeme, LexemeEnd);
}
}
mode SingleQuoteString : StringHelper
{
{PATTERN_SINGLE_QUOTE_STRING_DELIMITER}
{
// If we found the end of the string, switch back to Program mode.
self << Program;
}
}
mode DoubleQuoteString : StringHelper
{
{PATTERN_DOUBLE_QUOTE_STRING_DELIMITER}
{
// If we found the end of the string, switch back to Program mode.
self << Program;
}
}
// Entered from Operators on "++"/"--": rewinds the input and re-reads the
// operator to decide how it should be tokenized.
mode PrefixHelper : EOF
<skip: [ \t]> // Ignore whitespace
{
on_entry {
// Step backward so the "++"/"--" (and what precedes it) is re-read by
// the patterns below.  NOTE(review): the constant 3 presumably covers
// the 2-character operator plus one extra character — confirm.
self.seek_backward(3);
}
{PATTERN_NEWLINE}
{
// Newline before the operator: automatic semicolon insertion, but only
// outside parenthesized expressions.
if(self.iParaCount == 0)
self_send(';');
}
"++"
{
self_send(TOK_PLUSPLUS);
self << Program;
}
"--"
{
self_send(TOK_MINUSMINUS);
self << Program;
}
on_failure {
// Swallow the re-read character without emitting a token.
(void)Lexeme;
}
}
// Operator tokens.  Longer operators are listed before their prefixes so
// the longest-match rule resolves e.g. "===" vs "==" vs "=" correctly.
mode Operators : <inheritable: only>
{
"||" => TOK_OR;
"&&" => TOK_AND;
"++" { self << PrefixHelper; } // prefix/postfix decided in PrefixHelper
"--" { self << PrefixHelper; }
"===" => TOK_EQEQEQ;
"==" => TOK_EQEQ;
"!==" => TOK_NEQEQ;
"!=" => TOK_NEQ;
"*=" => TOK_MULTEQ;
"/=" => TOK_DIVEQ;
"%=" => TOK_MODEQ;
"+=" => TOK_PLUSEQ;
"\-=" => TOK_MINUSEQ;
">>>=" => TOK_GTGTGTEQ;
">>>" => TOK_GTGTGT;
"<<=" => TOK_LTLTEQ;
">>=" => TOK_GTGTEQ;
"<<" => TOK_LTLT;
">>" => TOK_GTGT;
"<=" => TOK_LTE;
">=" => TOK_GTE;
"&=" => TOK_AMPEQ;
"^=" => TOK_CIRCEQ;
"|=" => TOK_PIPEEQ;
['='] => '=';
['!'] => '!';
['('] { self_send('('); ++self.iParaCount; } // track parenthesis nesting
['+'] => '+';
['\-'] => '-';
['*'] => '*';
['/'] => '/';
['%'] => '%';
['<'] => '<';
['>'] => '>';
['\['] => '[';
['\]'] => ']';
['.'] => '.';
[','] => ',';
['?'] => '?';
[':'] => ':';
['~'] => '~';
['&'] => '&';
['^'] => '^';
['|'] => '|';
['{'] => '{';
[';'] => ';';
[')'] { self_send(')'); --self.iParaCount; }
['}'] { self_send(TOK_LINETERM); self_send('}'); } // '}' implies a line terminator
}
// Reserved words.  Keywords that open a "restricted production" (return,
// throw, break, continue) switch to RestrictedProduction so that a
// following newline forces an automatic semicolon.
mode Keywords : <inheritable: only>
{
function => TOK_FUNCTION;
return { self_send(TOK_RETURN); self << RestrictedProduction; }
var => TOK_VAR;
null => TOK_NULL;
true => TOK_TRUE;
false => TOK_FALSE;
instanceof => TOK_INSTANCEOF;
in => TOK_IN;
delete => TOK_DELETE;
void => TOK_VOID;
typeof => TOK_TYPEOF;
this => TOK_THIS;
if => TOK_IF;
else => TOK_ELSE;
with => TOK_WITH;
throw { self_send(TOK_THROW); self << RestrictedProduction; }
try => TOK_TRY;
catch => TOK_CATCH;
finally => TOK_FINALLY;
for => TOK_FOR;
break { self_send(TOK_BREAK); self << RestrictedProduction; }
continue { self_send(TOK_CONTINUE); self << RestrictedProduction; }
while => TOK_WHILE;
do => TOK_DO;
switch => TOK_SWITCH;
case => TOK_CASE;
default => TOK_DEFAULT;
new => TOK_NEW;
synchronized => TOK_SYNCHRONIZED;
}
// Literal values: numbers, and the two string delimiters which switch to
// the dedicated string-scanning modes.
mode Values : <inheritable: only>
{
{PATTERN_DECNUMBER} => TOK_DECLIT(Lexeme);
{PATTERN_HEXNUMBER} => TOK_HEXINTLIT(Lexeme);
{PATTERN_DOUBLE_QUOTE_STRING_DELIMITER} { self << DoubleQuoteString; }
{PATTERN_SINGLE_QUOTE_STRING_DELIMITER} { self << SingleQuoteString; }
}
// Anything matching PATTERN_ID that did not match a keyword becomes TOK_ID.
mode Identifiers : <inheritable: only>
{
{PATTERN_ID} => TOK_ID(Lexeme);
}
// Main scanning mode: combines keywords, identifiers, values and operators,
// skips whitespace and block comments.
mode Program : Keywords,
Identifiers,
Values,
Operators,
EOF
<skip: [ \t]>
<skip_range: "/*" "*/">
{
{PATTERN_NEWLINE}
{
// Line terminators matter for automatic semicolon insertion, but are
// suppressed inside parenthesized expressions.
if(self.iParaCount == 0)
self_send(TOK_LINETERM);
}
{PATTERN_SINGLELINE_COMMENT}
{}
}
body {
// Extra members injected into the generated lexer class.
// Record the previously delivered token; used by use_auto_semi() below.
void push_token(const unsigned int uiToken)
{
self.uiLastToken = self.uiCurrentToken;
self.uiCurrentToken = uiToken;
}
// True when the token before the current one was a line terminator.
bool use_auto_semi() const
{ return uiLastToken == TOK_LINETERM; }
unsigned int uiLastToken, // previously delivered token id
uiCurrentToken; // most recently delivered token id
int iParaCount; // open-parenthesis nesting depth
quex::Token* pLastID; // last TOK_ID token seen (set by the driver)
QUEX_NAME(Accumulator) accumulator; // collects string-literal contents
}
constructor {
self.uiLastToken = 0;
self.uiCurrentToken = 0;
self.iParaCount = 0;
self.pLastID = 0;
// The accumulator needs explicit construction; fail lexer construction
// if it cannot be set up.
if(!QUEX_NAME(Accumulator_construct)(&me->accumulator, me)) {
return false;
}
}
destructor {
QUEX_NAME(Accumulator_destruct)(&me->accumulator);
}
// Initial mode.
start = Program;
ecmascript_yacc.hpp
// Bison-generated token-id header (excerpt).  Quex is pointed at this file
// via --foreign-token-id-file so that lexer and parser agree on token ids;
// do not edit by hand — regenerate from ecmascript.y.
#ifndef YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED
# define YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED
/* Token type. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
TOK_TERMINATION = 0,
TOK_UNINITIALIZED = 1,
TOK_ID = 258,
TOK_NULL = 259,
TOK_TRUE = 260,
TOK_FALSE = 261,
TOK_DECLIT = 262,
TOK_HEXINTLIT = 263,
TOK_OR = 264,
TOK_AND = 265,
TOK_PLUSPLUS = 266,
TOK_MINUSMINUS = 267,
TOK_EQEQ = 268,
TOK_NEQ = 269,
TOK_EQEQEQ = 270,
TOK_NEQEQ = 271,
TOK_LTE = 272,
TOK_GTE = 273,
TOK_INSTANCEOF = 274,
TOK_IN = 275,
TOK_STRLITPART = 276,
TOK_QUOTE = 277,
TOK_VOID = 278,
TOK_TYPEOF = 279,
TOK_DELETE = 280,
TOK_THIS = 281,
TOK_LTLT = 282,
TOK_GTGT = 283,
TOK_GTGTGT = 284,
TOK_MULTEQ = 285,
TOK_DIVEQ = 286,
TOK_MODEQ = 287,
TOK_PLUSEQ = 288,
TOK_MINUSEQ = 289,
TOK_LTLTEQ = 290,
TOK_GTGTEQ = 291,
TOK_GTGTGTEQ = 292,
TOK_AMPEQ = 293,
TOK_CIRCEQ = 294,
TOK_PIPEEQ = 295,
TOK_IF = 296,
TOK_ELSE = 297,
TOK_RETURN = 298,
TOK_VAR = 299,
TOK_WITH = 300,
TOK_THROW = 301,
TOK_TRY = 302,
TOK_CATCH = 303,
TOK_FINALLY = 304,
TOK_FOR = 305,
TOK_BREAK = 306,
TOK_CONTINUE = 307,
TOK_WHILE = 308,
TOK_DO = 309,
TOK_SWITCH = 310,
TOK_CASE = 311,
TOK_DEFAULT = 312,
TOK_NEW = 313,
TOK_FUNCTION = 314,
TOK_SYNCHRONIZED = 315,
TOK_LINETERM = 316
};
#endif
/* Value type. */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif
#endif /* !YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED */
_main.cpp
#include <iostream>
#include "ecmascript_lexer"
/****************************************************************************************/
/// Print one token's textual representation to stdout, one token per line.
/// The stream is flushed after each token (std::endl) so output is visible
/// even if the lexer crashes on a later token.
void print_token(quex::Token* token)
{
    const auto text = token->get_string();
    std::cout << text << std::endl;
}
/****************************************************************************************/
/// Drive the generated lexer over 'id_test.js' and print every token.
///
/// Tokens are read until the lexer reports an error, receive() yields no
/// token, or TOK_TERMINATION is seen.  TOK_LINETERM tokens are printed and
/// recorded via push_token() but immediately followed by another receive()
/// (they do not reach the TOK_ID / TOK_TERMINATION checks) — this replaces
/// the original 'goto get_token' with a loop.  The most recent TOK_ID token
/// is remembered in lexer->pLastID.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    quex::ecmascript_lexer* lexer =
        quex::ecmascript_lexer::from_file_name("id_test.js", 0);
    // Fix: the original dereferenced the result unconditionally; fail
    // cleanly if the input file could not be opened.  A second, unused
    // default-constructed lexer ('qlex') was also removed.
    if (!lexer)
    {
        std::cerr << "error: could not open id_test.js" << std::endl;
        return 1;
    }

    quex::Token* token = 0;
    while (lexer->error_code == E_Error_None)
    {
        // Inner loop: keep fetching while we see line terminators.
        do
        {
            lexer->receive(&token);
            if (!token)
                break;
            print_token(token);
            lexer->push_token(token->type_id());
        } while (token->type_id() == TOK_LINETERM);

        if (!token)
            break;
        if (token->type_id() == TOK_ID)
            lexer->pLastID = token;
        if (token->type_id() == TOK_TERMINATION)
            break;
    }
    delete lexer;
    return 0;
}
id_test.js // Used for testing the lexer
test1 = safari;
myFunc()
function t(){}
if(test1 < 23)
return myFunc(45);
myFunc();
svenskaåäö();
var kalleö = 34;
var _us=kalleö;
_us = 678
日本語 = "Nihongo" // Japanska språkets namn
$myCar = _us
var new1 = kalleö ? t();
"kalleÖ, _us and $myCar should be ignored here"
الفصحى = "Arabiska"
/*
var new1 = kalleÖ ? t();
"kalleÖ, _us and $myCar should be ignored here"
*/
// var new1 = kalleÖ ? t();
대한민국 = 45;
대한민국X45 = "Value of: 대한민국" + 대한민국;
ärta="ärta + 2"
mix帝With대한민국 = "success?";
Örjan;
önes;
cake;
Россия;
РоссияX;
РоссияX
XРоссия;
XРоссия;
始皇帝 = "The First emperor"
始皇帝x2 = "The First emperor, twice?"
Best regards,
Patrik J
I would suggest you rely on Unicode properties, in particular
ID_Start
and ID_Continue
, such that your .qx file contains the corresponding property expressions (\P{ID_Start} and \P{ID_Continue}). Quex then samples the UCS database and you won't have to worry about the particular code points.
Also, if you only want to support a subset, use
intersection
to cut out the desired UCS range, as in the following example:

PS: your solution is not totally wrong. Given a file example.qx:
And some user file 'example.c':
then executing on the command line:
delivers
given that the file 'example.txt' is UTF-8 encoded and contains
I do not know what to say else. Am I understanding something wrong?