fix(frontends/lean/scanner): decode utf8

Signed-off-by: Leonardo de Moura <leonardo@microsoft.com>
This commit is contained in:
Leonardo de Moura 2014-08-01 19:58:02 -07:00
parent 4b604521a0
commit df57043861
2 changed files with 143 additions and 69 deletions

View file

@ -11,6 +11,27 @@ Author: Leonardo de Moura
#include "frontends/lean/parser_config.h"
namespace lean {
bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; }
unsigned scanner::get_utf8_size(unsigned char c) {
if ((c & 0x80) == 0)
return 1;
else if ((c & 0xE0) == 0xC0)
return 2;
else if ((c & 0xF0) == 0xE0)
return 3;
else if ((c & 0xF8) == 0xF0)
return 4;
else if ((c && 0xFC) == 0xF8)
return 5;
else if ((c && 0xFE) == 0xFC)
return 6;
else if (c == 0xFF)
return 1;
else
throw_exception("invalid utf-8 head character");
}
void scanner::set_line(unsigned p) {
m_line = p;
m_sline = p;
@ -20,6 +41,16 @@ void scanner::next() {
lean_assert(m_curr != EOF);
m_curr = m_stream.get();
m_spos++;
if (m_uskip > 0) {
if (!is_utf8_next(m_curr)) {
throw_exception("invalid utf-8 sequence character");
}
m_uskip--;
} else {
m_upos++;
m_uskip = get_utf8_size(m_curr);
m_uskip--;
}
}
void scanner::check_not_eof(char const * error_msg) {
@ -28,7 +59,7 @@ void scanner::check_not_eof(char const * error_msg) {
[[ noreturn ]] void scanner::throw_exception(char const * msg) {
unsigned line = m_sline;
unsigned pos = m_spos;
unsigned pos = m_upos;
while (curr() != EOF && !std::isspace(curr()))
next();
throw parser_exception(msg, m_stream_name.c_str(), line, pos);
@ -214,94 +245,132 @@ auto scanner::read_script_block() -> token_kind {
return token_kind::ScriptBlock;
}
static bool is_id_first(char c) { return std::isalpha(c) || c == '_'; }
static bool is_id_rest(char c) { return std::isalnum(c) || c == '_' || c == '\''; }
void scanner::move_back(std::streamoff offset) {
void scanner::move_back(unsigned offset, unsigned u_offset) {
lean_assert(m_uskip == 0);
if (offset != 0) {
if (curr() == EOF) {
m_curr = 0;
m_stream.clear();
m_spos--;
m_upos--;
offset--;
u_offset--;
}
if (offset != 0) {
m_stream.seekg(-static_cast<std::streamoff>(offset), std::ios_base::cur);
m_spos -= offset;
m_upos -= u_offset;
}
m_stream.seekg(offset, std::ios_base::cur);
m_spos += offset;
next();
}
}
bool scanner::is_next_id_rest() {
lean_assert(curr() != EOF);
char c = m_stream.get();
bool r = is_id_rest(c);
m_stream.unget();
return r;
void scanner::next_utf_core(char c, buffer<char> & cs) {
cs.push_back(c);
while (m_uskip > 0) {
next();
cs.push_back(curr());
}
}
void scanner::next_utf(buffer<char> & cs) {
next();
next_utf_core(curr(), cs);
}
static bool is_id_first(buffer<char> const & cs, unsigned i) {
return std::isalpha(cs[i]) || cs[i] == '_';
}
static bool is_id_rest(buffer<char> const & cs, unsigned i) {
return std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'';
}
auto scanner::read_key_cmd_id() -> token_kind {
static char const * error_msg = "unexpected token";
char c = curr();
unsigned num_cs = 1; // number of characters read
token_table const * it = find(*m_tokens, c);
token_info const * info = nullptr;
unsigned key_size = 0;
if (it) {
info = value_of(*it);
if (info)
key_size = 1;
}
std::string & id_part = m_buffer;
bool is_id = is_id_first(c);
unsigned id_size = 0;
if (is_id) {
m_name_val = name();
id_part.clear();
id_part.push_back(c);
id_size = 1;
}
while (it || is_id) {
next();
c = curr();
if (c != EOF)
num_cs++;
if (is_id) {
if (is_id_rest(c)) {
id_part.push_back(c);
id_size++;
} else if (c == '.' && is_next_id_rest()) {
m_name_val = name(m_name_val, id_part.c_str());
id_size++;
id_part.clear();
buffer<char> cs;
next_utf_core(curr(), cs);
unsigned num_utfs = 1;
unsigned id_sz = 0;
unsigned id_utf_sz = 0;
if (is_id_first(cs, 0)) {
id_sz = cs.size();
while (true) {
id_sz = cs.size();
id_utf_sz = num_utfs;
unsigned i = id_sz;
next_utf(cs);
num_utfs++;
if (is_id_rest(cs, i)) {
} else if (cs[i] == '.') {
next_utf(cs);
num_utfs++;
if (!is_id_rest(cs, i+1))
break;
} else {
is_id = false;
m_name_val = name(m_name_val, id_part.c_str());
if (!it)
return token_kind::Identifier;
break;
}
}
}
if (it) {
it = find(*it, c);
unsigned i = 0;
token_table const * it = find(*m_tokens, cs[i]);
token_info const * info = nullptr;
unsigned key_sz = 0;
unsigned key_utf_sz = 0;
unsigned aux_num_utfs = id_utf_sz;
if (it) {
if (aux_num_utfs == 0)
aux_num_utfs = 1;
info = value_of(*it);
if (info) {
lean_assert(m_uskip == 0);
key_sz = 1;
key_utf_sz = aux_num_utfs;
}
while (it) {
i++;
if (i == cs.size()) {
next_utf(cs);
num_utfs++;
aux_num_utfs = num_utfs;
}
it = find(*it, cs[i]);
if (it) {
if (auto new_info = value_of(*it)) {
info = new_info;
key_size = num_cs;
}
} else if (!is_id) {
if (id_size > key_size) {
move_back(static_cast<std::streamoff>(id_size) - static_cast<std::streamoff>(num_cs));
return token_kind::Identifier;
} else if (info) {
move_back(static_cast<std::streamoff>(key_size) - static_cast<std::streamoff>(num_cs));
m_token_info = info;
return info->is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
lean_assert(m_uskip == 0);
info = new_info;
key_sz = i+1;
key_utf_sz = aux_num_utfs;
}
} else {
break;
}
}
}
throw_exception(error_msg);
if (id_sz == 0 && key_sz == 0)
throw_exception(error_msg);
if (id_sz > key_sz) {
move_back(cs.size() - id_sz, num_utfs - id_utf_sz);
m_name_val = name();
std::string & id_part = m_buffer;
id_part.clear();
for (unsigned i = 0; i < id_sz; i++) {
if (cs[i] == '.') {
m_name_val = name(m_name_val, id_part.c_str());
id_part.clear();
} else {
id_part.push_back(cs[i]);
}
}
m_name_val = name(m_name_val, id_part.c_str());
return token_kind::Identifier;
} else {
move_back(cs.size() - key_sz, num_utfs - key_utf_sz);
m_token_info = info;
return info->is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
}
}
static name g_begin_script_tk("(*");
@ -312,7 +381,7 @@ auto scanner::scan(environment const & env) -> token_kind {
m_tokens = &get_token_table(env);
while (true) {
char c = curr();
m_pos = m_spos;
m_pos = m_upos;
m_line = m_sline;
switch (c) {
case ' ': case '\r': case '\t':
@ -352,7 +421,7 @@ auto scanner::scan(environment const & env) -> token_kind {
}
scanner::scanner(std::istream & strm, char const * strm_name):
m_tokens(nullptr), m_stream(strm), m_spos(0), m_sline(1), m_curr(0), m_pos(0), m_line(1),
m_tokens(nullptr), m_stream(strm), m_spos(0), m_upos(0), m_uskip(0), m_sline(1), m_curr(0), m_pos(0), m_line(1),
m_token_info(nullptr) {
m_stream_name = strm_name ? strm_name : "[unknown]";
next();

View file

@ -29,6 +29,8 @@ protected:
std::string m_stream_name;
int m_spos; // current position
int m_upos; // current position taking into account utf-8 encoding
int m_uskip; // hack for decoding utf-8, it marks how many units to skip
int m_sline; // current line
char m_curr; // current char;
@ -45,16 +47,19 @@ protected:
void next();
char curr() const { return m_curr; }
char curr_next() { char c = curr(); next(); return c; }
void new_line() { m_sline++; m_spos = 0; }
void new_line() { m_sline++; m_spos = 0; m_upos = 0; }
void update_line() { if (curr() == '\n') new_line(); }
void check_not_eof(char const * error_msg);
bool is_next_digit();
bool is_next_id_rest();
void move_back(std::streamoff offset);
void move_back(unsigned offset, unsigned u_offset);
bool consume(char const * str, char const * error_msg);
void read_single_line_comment();
void read_comment_block();
void read_until(char const * end_str, char const * error_msg);
unsigned get_utf8_size(unsigned char c);
void next_utf_core(char c, buffer<char> & cs);
void next_utf(buffer<char> & cs);
token_kind read_string();
token_kind read_number();