feat(frontends/lean/scanner): allow letter-like unicode characters and sub/super-scripts in identifiers
Signed-off-by: Leonardo de Moura <leonardo@microsoft.com>
This commit is contained in:
parent
52e3505599
commit
a62e6f84e3
1 changed files with 54 additions and 2 deletions
|
@ -32,6 +32,52 @@ unsigned scanner::get_utf8_size(unsigned char c) {
|
||||||
throw_exception("invalid utf-8 head character");
|
throw_exception("invalid utf-8 head character");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned char to_uchar(char c) { return static_cast<unsigned char>(c); }
|
||||||
|
|
||||||
|
unsigned utf8_to_unicode(char const * begin, char const * end) {
|
||||||
|
unsigned result = 0;
|
||||||
|
if (begin == end)
|
||||||
|
return result;
|
||||||
|
auto it = begin;
|
||||||
|
unsigned c = to_uchar(*it);
|
||||||
|
++it;
|
||||||
|
if (c < 128)
|
||||||
|
return c;
|
||||||
|
unsigned mask = (1u << 6) -1;
|
||||||
|
unsigned hmask = mask;
|
||||||
|
unsigned shift = 0;
|
||||||
|
unsigned num_bits = 0;
|
||||||
|
while ((c & 0xC0) == 0xC0) {
|
||||||
|
c <<= 1;
|
||||||
|
c &= 0xff;
|
||||||
|
num_bits += 6;
|
||||||
|
hmask >>= 1;
|
||||||
|
shift++;
|
||||||
|
result <<= 6;
|
||||||
|
if (it == end)
|
||||||
|
return 0;
|
||||||
|
result |= *it & mask;
|
||||||
|
++it;
|
||||||
|
}
|
||||||
|
result |= ((c >> shift) & hmask) << num_bits;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; }
|
||||||
|
bool is_letter_like_unicode(unsigned u) {
|
||||||
|
return
|
||||||
|
(0x3b1 <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda
|
||||||
|
(0x3ca <= u && u <= 0x3fb) || // Coptic letters
|
||||||
|
(0x1f00 <= u && u <= 0x1ffe) || // Polytonic Greek Extended Character Set
|
||||||
|
(0x2100 <= u && u <= 0x214f); // Letter like block
|
||||||
|
}
|
||||||
|
bool is_super_sub_script_alnum_unicode(unsigned u) {
|
||||||
|
return
|
||||||
|
(0x2070 <= u && u <= 0x2079) ||
|
||||||
|
(0x207f <= u && u <= 0x2089) ||
|
||||||
|
(0x2090 <= u && u <= 0x209c);
|
||||||
|
}
|
||||||
|
|
||||||
void scanner::set_line(unsigned p) {
|
void scanner::set_line(unsigned p) {
|
||||||
m_line = p;
|
m_line = p;
|
||||||
m_sline = p;
|
m_sline = p;
|
||||||
|
@ -279,11 +325,17 @@ void scanner::next_utf(buffer<char> & cs) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_id_first(buffer<char> const & cs, unsigned i) {
|
static bool is_id_first(buffer<char> const & cs, unsigned i) {
|
||||||
return std::isalpha(cs[i]) || cs[i] == '_';
|
if (std::isalpha(cs[i]) || cs[i] == '_')
|
||||||
|
return true;
|
||||||
|
unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
|
||||||
|
return is_letter_like_unicode(u);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_id_rest(buffer<char> const & cs, unsigned i) {
|
static bool is_id_rest(buffer<char> const & cs, unsigned i) {
|
||||||
return std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'';
|
if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'')
|
||||||
|
return true;
|
||||||
|
unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
|
||||||
|
return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto scanner::read_key_cmd_id() -> token_kind {
|
auto scanner::read_key_cmd_id() -> token_kind {
|
||||||
|
|
Loading…
Reference in a new issue