fix(frontends/lean/scanner): decode utf8

Signed-off-by: Leonardo de Moura <leonardo@microsoft.com>
2014-08-01 19:58:02 -07:00 · 2014-08-01 19:58:02 -07:00 · df57043861
commit df57043861
parent 4b604521a0
2 changed files with 143 additions and 69 deletions
--- a/src/frontends/lean/scanner.cpp
+++ b/src/frontends/lean/scanner.cpp
@ -11,6 +11,27 @@ Author: Leonardo de Moura
 #include "frontends/lean/parser_config.h"

 namespace lean {
+bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; }
+
+unsigned scanner::get_utf8_size(unsigned char c) {
+    if ((c & 0x80) == 0)
+        return 1;
+    else if ((c & 0xE0) == 0xC0)
+        return 2;
+    else if ((c & 0xF0) == 0xE0)
+        return 3;
+    else if ((c & 0xF8) == 0xF0)
+        return 4;
+    else if ((c && 0xFC) == 0xF8)
+        return 5;
+    else if ((c && 0xFE) == 0xFC)
+        return 6;
+    else if (c == 0xFF)
+        return 1;
+    else
+        throw_exception("invalid utf-8 head character");
+}
+
 void scanner::set_line(unsigned p) {
    m_line  = p;
    m_sline = p;
@ -20,6 +41,16 @@ void scanner::next() {
    lean_assert(m_curr != EOF);
    m_curr = m_stream.get();
    m_spos++;
+    if (m_uskip > 0) {
+        if (!is_utf8_next(m_curr)) {
+            throw_exception("invalid utf-8 sequence character");
+        }
+        m_uskip--;
+    } else {
+        m_upos++;
+        m_uskip = get_utf8_size(m_curr);
+        m_uskip--;
+    }
 }

 void scanner::check_not_eof(char const * error_msg) {
@ -28,7 +59,7 @@ void scanner::check_not_eof(char const * error_msg) {

 [[ noreturn ]] void scanner::throw_exception(char const * msg) {
    unsigned line = m_sline;
-    unsigned pos  = m_spos;
+    unsigned pos  = m_upos;
    while (curr() != EOF && !std::isspace(curr()))
        next();
    throw parser_exception(msg, m_stream_name.c_str(), line, pos);
@ -214,94 +245,132 @@ auto scanner::read_script_block() -> token_kind {
    return token_kind::ScriptBlock;
 }

-static bool is_id_first(char c) { return std::isalpha(c) || c == '_'; }
-static bool is_id_rest(char c)  { return std::isalnum(c) || c == '_' || c == '\''; }
-
-void scanner::move_back(std::streamoff offset) {
+void scanner::move_back(unsigned offset, unsigned u_offset) {
+    lean_assert(m_uskip == 0);
    if (offset != 0) {
        if (curr() == EOF) {
            m_curr = 0;
            m_stream.clear();
            m_spos--;
+            m_upos--;
+            offset--;
+            u_offset--;
+        }
+        if (offset != 0) {
+            m_stream.seekg(-static_cast<std::streamoff>(offset), std::ios_base::cur);
+            m_spos -= offset;
+            m_upos -= u_offset;
        }
-        m_stream.seekg(offset, std::ios_base::cur);
-        m_spos += offset;
        next();
    }
 }

-bool scanner::is_next_id_rest() {
-    lean_assert(curr() != EOF);
-    char c = m_stream.get();
-    bool r = is_id_rest(c);
-    m_stream.unget();
-    return r;
+void scanner::next_utf_core(char c, buffer<char> & cs) {
+    cs.push_back(c);
+    while (m_uskip > 0) {
+        next();
+        cs.push_back(curr());
+    }
+}
+
+void scanner::next_utf(buffer<char> & cs) {
+    next();
+    next_utf_core(curr(), cs);
+}
+
+static bool is_id_first(buffer<char> const & cs, unsigned i) {
+    return std::isalpha(cs[i]) || cs[i] == '_';
+}
+
+static bool is_id_rest(buffer<char> const & cs, unsigned i)  {
+    return std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'';
 }

 auto scanner::read_key_cmd_id() -> token_kind {
    static char const * error_msg = "unexpected token";
-    char c = curr();
-    unsigned num_cs = 1; // number of characters read
-    token_table const * it    = find(*m_tokens, c);
-    token_info const * info = nullptr;
-    unsigned key_size       = 0;
-    if (it) {
-        info = value_of(*it);
-        if (info)
-            key_size = 1;
-    }
-    std::string & id_part = m_buffer;
-    bool is_id = is_id_first(c);
-    unsigned id_size = 0;
-    if (is_id) {
-        m_name_val = name();
-        id_part.clear();
-        id_part.push_back(c);
-        id_size = 1;
-    }
-
-    while (it || is_id) {
-        next();
-        c = curr();
-        if (c != EOF)
-            num_cs++;
-
-        if (is_id) {
-            if (is_id_rest(c)) {
-                id_part.push_back(c);
-                id_size++;
-            } else if (c == '.' && is_next_id_rest()) {
-                m_name_val = name(m_name_val, id_part.c_str());
-                id_size++;
-                id_part.clear();
+    buffer<char> cs;
+    next_utf_core(curr(), cs);
+    unsigned num_utfs  = 1;
+    unsigned id_sz     = 0;
+    unsigned id_utf_sz = 0;
+    if (is_id_first(cs, 0)) {
+        id_sz = cs.size();
+        while (true) {
+            id_sz     = cs.size();
+            id_utf_sz = num_utfs;
+            unsigned i = id_sz;
+            next_utf(cs);
+            num_utfs++;
+            if (is_id_rest(cs, i)) {
+            } else if (cs[i] == '.') {
+                next_utf(cs);
+                num_utfs++;
+                if (!is_id_rest(cs, i+1))
+                    break;
            } else {
-                is_id = false;
-                m_name_val = name(m_name_val, id_part.c_str());
-                if (!it)
-                    return token_kind::Identifier;
+                break;
            }
        }
+    }

-        if (it) {
-            it = find(*it, c);
+    unsigned i = 0;
+    token_table const * it  = find(*m_tokens, cs[i]);
+    token_info const * info = nullptr;
+    unsigned key_sz       = 0;
+    unsigned key_utf_sz   = 0;
+    unsigned aux_num_utfs = id_utf_sz;
+    if (it) {
+        if (aux_num_utfs == 0)
+            aux_num_utfs = 1;
+        info = value_of(*it);
+        if (info) {
+            lean_assert(m_uskip == 0);
+            key_sz     = 1;
+            key_utf_sz = aux_num_utfs;
+        }
+        while (it) {
+            i++;
+            if (i == cs.size()) {
+                next_utf(cs);
+                num_utfs++;
+                aux_num_utfs = num_utfs;
+            }
+            it = find(*it, cs[i]);
            if (it) {
                if (auto new_info = value_of(*it)) {
-                    info     = new_info;
-                    key_size = num_cs;
-                }
-            } else if (!is_id) {
-                if (id_size > key_size) {
-                    move_back(static_cast<std::streamoff>(id_size) - static_cast<std::streamoff>(num_cs));
-                    return token_kind::Identifier;
-                } else if (info) {
-                    move_back(static_cast<std::streamoff>(key_size) - static_cast<std::streamoff>(num_cs));
-                    m_token_info = info;
-                    return info->is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
+                    lean_assert(m_uskip == 0);
+                    info       = new_info;
+                    key_sz     = i+1;
+                    key_utf_sz = aux_num_utfs;
                }
+            } else {
+                break;
            }
        }
    }
-    throw_exception(error_msg);
+
+    if (id_sz == 0 && key_sz == 0)
+        throw_exception(error_msg);
+    if (id_sz > key_sz) {
+        move_back(cs.size() - id_sz, num_utfs - id_utf_sz);
+        m_name_val = name();
+        std::string & id_part = m_buffer;
+        id_part.clear();
+        for (unsigned i = 0; i < id_sz; i++) {
+            if (cs[i] == '.') {
+                m_name_val = name(m_name_val, id_part.c_str());
+                id_part.clear();
+            } else {
+                id_part.push_back(cs[i]);
+            }
+        }
+        m_name_val = name(m_name_val, id_part.c_str());
+        return token_kind::Identifier;
+    } else {
+        move_back(cs.size() - key_sz, num_utfs - key_utf_sz);
+        m_token_info = info;
+        return info->is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
+    }
 }

 static name g_begin_script_tk("(*");
@ -312,7 +381,7 @@ auto scanner::scan(environment const & env) -> token_kind {
    m_tokens = &get_token_table(env);
    while (true) {
        char c = curr();
-        m_pos  = m_spos;
+        m_pos  = m_upos;
        m_line = m_sline;
        switch (c) {
        case ' ': case '\r': case '\t':
@ -352,7 +421,7 @@ auto scanner::scan(environment const & env) -> token_kind {
 }

 scanner::scanner(std::istream & strm, char const * strm_name):
-    m_tokens(nullptr), m_stream(strm), m_spos(0), m_sline(1), m_curr(0), m_pos(0), m_line(1),
+    m_tokens(nullptr), m_stream(strm), m_spos(0), m_upos(0), m_uskip(0), m_sline(1), m_curr(0), m_pos(0), m_line(1),
    m_token_info(nullptr) {
    m_stream_name = strm_name ? strm_name : "[unknown]";
    next();
--- a/src/frontends/lean/scanner.h
+++ b/src/frontends/lean/scanner.h
@ -29,6 +29,8 @@ protected:
    std::string         m_stream_name;

    int                 m_spos;  // current position
+    int                 m_upos;  // current position taking into account utf-8 encoding
+    int                 m_uskip; // hack for decoding utf-8, it marks how many units to skip
    int                 m_sline; // current line
    char                m_curr;  // current char;

@ -45,16 +47,19 @@ protected:
    void next();
    char curr() const { return m_curr; }
    char curr_next() { char c = curr(); next(); return c; }
-    void new_line() { m_sline++; m_spos = 0; }
+    void new_line() { m_sline++; m_spos = 0; m_upos = 0; }
    void update_line() { if (curr() == '\n') new_line(); }
    void check_not_eof(char const * error_msg);
    bool is_next_digit();
    bool is_next_id_rest();
-    void move_back(std::streamoff offset);
+    void move_back(unsigned offset, unsigned u_offset);
    bool consume(char const * str, char const * error_msg);
    void read_single_line_comment();
    void read_comment_block();
    void read_until(char const * end_str, char const * error_msg);
+    unsigned get_utf8_size(unsigned char c);
+    void next_utf_core(char c, buffer<char> & cs);
+    void next_utf(buffer<char> & cs);

    token_kind read_string();
    token_kind read_number();