lean2/src/frontends/lean/scanner.cpp

492 lines
14 KiB
C++
Raw Normal View History

/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Leonardo de Moura
*/
#include <cctype>
#include <string>
#include "util/exception.h"
#include "util/utf8.h"
#include "frontends/lean/scanner.h"
#include "frontends/lean/parser_config.h"
namespace lean {
unsigned scanner::get_utf8_size(unsigned char c) {
unsigned r = ::lean::get_utf8_size(c);
if (r == 0)
throw_exception("invalid utf-8 head character");
return r;
}
unsigned char to_uchar(char c) { return static_cast<unsigned char>(c); }
unsigned utf8_to_unicode(char const * begin, char const * end) {
unsigned result = 0;
if (begin == end)
return result;
auto it = begin;
unsigned c = to_uchar(*it);
++it;
if (c < 128)
return c;
unsigned mask = (1u << 6) -1;
unsigned hmask = mask;
unsigned shift = 0;
unsigned num_bits = 0;
while ((c & 0xC0) == 0xC0) {
c <<= 1;
c &= 0xff;
num_bits += 6;
hmask >>= 1;
shift++;
result <<= 6;
if (it == end)
return 0;
result |= *it & mask;
++it;
}
result |= ((c >> shift) & hmask) << num_bits;
return result;
}
bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; }
bool is_letter_like_unicode(unsigned u) {
return
(0x3b1 <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda
(0x391 <= u && u <= 0x3A9 && u != 0x3A0 && u != 0x3A3) || // Upper greek, but Pi and Sigma
(0x3ca <= u && u <= 0x3fb) || // Coptic letters
(0x1f00 <= u && u <= 0x1ffe) || // Polytonic Greek Extended Character Set
(0x2100 <= u && u <= 0x214f); // Letter like block
}
bool is_super_sub_script_alnum_unicode(unsigned u) {
return
(0x2070 <= u && u <= 0x2079) || // most (numeric) superscripts
(0x207f <= u && u <= 0x2089) || // n superscript and numberic subscripts
(0x2090 <= u && u <= 0x209c) || // letter-like subscripts
u == 0x00B2 || // 2 superscript
u == 0x00B3 || // 3 superscript
u == 0x00B9; // 1 superscript
}
void scanner::next() {
lean_assert(m_curr != EOF);
m_spos++;
refactor(frontends/lean/scanner): remove dependency to seekg and unget methods It is not safe to use seekg for textual files. Here is a fragment from a C++ manual: seekg() and seekp() This pair of functions serve respectively to change the position of stream pointers get and put. Both functions are overloaded with two different prototypes: seekg ( pos_type position ); seekp ( pos_type position ); Using this prototype the stream pointer is changed to an absolute position from the beginning of the file. The type required is the same as that returned by functions tellg and tellp. seekg ( off_type offset, seekdir direction ); seekp ( off_type offset, seekdir direction ); Using this prototype, an offset from a concrete point determined by parameter direction can be specified. It can be: ios::beg offset specified from the beginning of the stream ios::cur offset specified from the current position of the stream pointer ios::end offset specified from the end of the stream The values of both stream pointers get and put are counted in different ways for text files than for binary files, since in text mode files some modifications to the appearance of some special characters can occur. For that reason it is advisable to use only the first prototype of seekg and seekp with files opened in text mode and always use non-modified values returned by tellg or tellp. With binary files, you can freely use all the implementations for these functions. They should not have any unexpected behavior.
2014-09-18 22:24:48 +00:00
while (m_spos >= static_cast<int>(m_curr_line.size())) {
if (m_last_line) {
m_curr = EOF;
return;
} else {
m_curr_line.clear();
if (std::getline(m_stream, m_curr_line)) {
m_curr_line.push_back('\n');
m_sline++;
m_spos = 0;
m_upos = 0;
m_curr = m_curr_line[m_spos];
m_uskip = get_utf8_size(m_curr);
m_uskip--;
return;
} else {
m_last_line = true;
m_curr = EOF;
return;
}
}
}
m_curr = m_curr_line[m_spos];
if (m_uskip > 0) {
refactor(frontends/lean/scanner): remove dependency to seekg and unget methods It is not safe to use seekg for textual files. Here is a fragment from a C++ manual: seekg() and seekp() This pair of functions serve respectively to change the position of stream pointers get and put. Both functions are overloaded with two different prototypes: seekg ( pos_type position ); seekp ( pos_type position ); Using this prototype the stream pointer is changed to an absolute position from the beginning of the file. The type required is the same as that returned by functions tellg and tellp. seekg ( off_type offset, seekdir direction ); seekp ( off_type offset, seekdir direction ); Using this prototype, an offset from a concrete point determined by parameter direction can be specified. It can be: ios::beg offset specified from the beginning of the stream ios::cur offset specified from the current position of the stream pointer ios::end offset specified from the end of the stream The values of both stream pointers get and put are counted in different ways for text files than for binary files, since in text mode files some modifications to the appearance of some special characters can occur. For that reason it is advisable to use only the first prototype of seekg and seekp with files opened in text mode and always use non-modified values returned by tellg or tellp. With binary files, you can freely use all the implementations for these functions. They should not have any unexpected behavior.
2014-09-18 22:24:48 +00:00
if (!is_utf8_next(m_curr))
throw_exception("invalid utf-8 sequence character");
m_uskip--;
} else {
m_upos++;
m_uskip = get_utf8_size(m_curr);
m_uskip--;
}
}
void scanner::check_not_eof(char const * error_msg) {
if (curr() == EOF) throw_exception(error_msg);
}
[[ noreturn ]] void scanner::throw_exception(char const * msg) {
unsigned line = m_sline;
unsigned pos = m_upos;
while (curr() != EOF && !std::isspace(curr()))
next();
throw parser_exception(msg, m_stream_name.c_str(), line, pos);
}
static char const * g_end_error_str_msg = "unexpected end of string";
auto scanner::read_string() -> token_kind {
lean_assert(curr() == '\"');
next();
m_buffer.clear();
while (true) {
check_not_eof(g_end_error_str_msg);
char c = curr();
if (c == '\"') {
next();
return token_kind::String;
} else if (c == '\\') {
next();
check_not_eof(g_end_error_str_msg);
c = curr();
if (c != '\\' && c != '\"' && c != 'n')
throw_exception("invalid escape sequence");
if (c == 'n')
c = '\n';
}
m_buffer += c;
next();
}
}
auto scanner::read_quoted_symbol() -> token_kind {
lean_assert(curr() == '`');
next();
if (std::isdigit(curr()))
throw_exception("first character of a quoted symbols cannot be a digit");
m_buffer.clear();
while (true) {
check_not_eof("unexpected quoted identifier");
char c = curr();
next();
if (c == '`') {
m_name_val = name(m_buffer.c_str());
return token_kind::QuotedSymbol;
} else if (c != ' ' && c != '\"' && c != '\n' && c != '\t') {
m_buffer += c;
} else {
throw_exception("invalid quoted symbol, invalid character");
}
}
}
bool scanner::is_next_digit() {
lean_assert(curr() != EOF);
refactor(frontends/lean/scanner): remove dependency to seekg and unget methods It is not safe to use seekg for textual files. Here is a fragment from a C++ manual: seekg() and seekp() This pair of functions serve respectively to change the position of stream pointers get and put. Both functions are overloaded with two different prototypes: seekg ( pos_type position ); seekp ( pos_type position ); Using this prototype the stream pointer is changed to an absolute position from the beginning of the file. The type required is the same as that returned by functions tellg and tellp. seekg ( off_type offset, seekdir direction ); seekp ( off_type offset, seekdir direction ); Using this prototype, an offset from a concrete point determined by parameter direction can be specified. It can be: ios::beg offset specified from the beginning of the stream ios::cur offset specified from the current position of the stream pointer ios::end offset specified from the end of the stream The values of both stream pointers get and put are counted in different ways for text files than for binary files, since in text mode files some modifications to the appearance of some special characters can occur. For that reason it is advisable to use only the first prototype of seekg and seekp with files opened in text mode and always use non-modified values returned by tellg or tellp. With binary files, you can freely use all the implementations for these functions. They should not have any unexpected behavior.
2014-09-18 22:24:48 +00:00
if (m_spos + 1 < static_cast<int>(m_curr_line.size()))
return std::isdigit(m_curr_line[m_spos+1]);
else
return false;
}
auto scanner::read_number() -> token_kind {
lean_assert('0' <= curr() && curr() <= '9');
mpq q(1);
m_num_val = curr() - '0';
next();
bool is_decimal = false;
while (true) {
char c = curr();
if ('0' <= c && c <= '9') {
m_num_val = 10*m_num_val + (c - '0');
if (is_decimal)
q *= 10;
next();
} else if (c == '.') {
// Num. is not a decimal. It should be at least Num.0
if (is_next_digit()) {
if (is_decimal)
break;
is_decimal = true;
next();
} else {
break;
}
} else {
break;
}
}
if (is_decimal)
m_num_val /= q;
return is_decimal ? token_kind::Decimal : token_kind::Numeral;
}
void scanner::read_single_line_comment() {
while (true) {
if (curr() == '\n') {
next();
return;
} else if (curr() == EOF) {
return;
} else {
next();
}
}
}
void scanner::read_comment_block() {
unsigned nesting = 1;
while (true) {
char c = curr();
check_not_eof("unexpected end of comment block");
next();
if (c == '/') {
if (curr() == '-') {
next();
nesting++;
}
} else if (c == '-') {
if (curr() == '/') {
next();
nesting--;
if (nesting == 0)
return;
}
}
}
}
// Read until the end_str is found, store all characters (not including end_str) in m_buffer.
// Throw a parser exception error_msg if end of file is found before end_str.
void scanner::read_until(char const * end_str, char const * error_msg) {
lean_assert(end_str);
lean_assert(end_str[0]);
m_buffer.clear();
while (true) {
check_not_eof(error_msg);
char c = curr_next();
if (c == end_str[0]) {
m_aux_buffer.clear();
m_aux_buffer += c;
unsigned i = 1;
while (true) {
if (!end_str[i])
return;
check_not_eof(error_msg);
c = curr_next();
if (c != end_str[i]) {
m_buffer += m_aux_buffer;
break;
}
i++;
}
} else {
m_buffer += c;
}
}
}
auto scanner::read_script_block() -> token_kind {
read_until("*)", "unexpected end of script");
return token_kind::ScriptBlock;
}
void scanner::move_back(unsigned offset, unsigned u_offset) {
lean_assert(m_uskip == 0);
if (offset != 0) {
if (curr() == EOF) {
m_curr = 0;
m_spos--;
m_upos--;
offset--;
u_offset--;
}
if (offset != 0) {
m_spos -= offset;
m_upos -= u_offset;
}
next();
}
}
void scanner::next_utf_core(char c, buffer<char> & cs) {
cs.push_back(c);
while (m_uskip > 0) {
next();
cs.push_back(curr());
}
}
void scanner::next_utf(buffer<char> & cs) {
next();
next_utf_core(curr(), cs);
}
static bool is_id_first(buffer<char> const & cs, unsigned i) {
if (std::isalpha(cs[i]) || cs[i] == '_')
return true;
unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
return is_letter_like_unicode(u);
}
static bool is_id_rest(buffer<char> const & cs, unsigned i) {
if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'')
return true;
unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u);
}
static char const * g_error_key_msg = "unexpected token";
auto scanner::read_key_cmd_id() -> token_kind {
buffer<char> cs;
next_utf_core(curr(), cs);
unsigned num_utfs = 1;
unsigned id_sz = 0;
unsigned id_utf_sz = 0;
if (is_id_first(cs, 0)) {
id_sz = cs.size();
while (true) {
id_sz = cs.size();
id_utf_sz = num_utfs;
unsigned i = id_sz;
next_utf(cs);
num_utfs++;
if (is_id_rest(cs, i)) {
} else if (cs[i] == '.') {
next_utf(cs);
num_utfs++;
if (!is_id_first(cs, i+1))
break;
} else {
break;
}
}
}
unsigned i = 0;
token_table const * it = m_tokens;
token_info const * info = nullptr;
unsigned key_sz = 0;
unsigned key_utf_sz = 0;
while (i < id_sz) {
it = find(*it, cs[i]);
i++;
if (!it)
break;
if (auto new_info = value_of(*it)) {
lean_assert(m_uskip == 0);
info = new_info;
key_sz = i;
key_utf_sz = id_utf_sz; // this is imprecise if key_sz < id_sz, but this case is irrelevant
}
}
while (it) {
if (i == cs.size()) {
next_utf(cs);
num_utfs++;
}
it = find(*it, cs[i]);
i++;
if (!it)
break;
if (auto new_info = value_of(*it)) {
lean_assert(m_uskip == 0);
info = new_info;
key_sz = i;
key_utf_sz = num_utfs;
lean_assert(key_sz > id_sz);
}
}
if (id_sz == 0 && key_sz == 0)
throw_exception(g_error_key_msg);
if (id_sz > key_sz) {
move_back(cs.size() - id_sz, num_utfs - id_utf_sz);
m_name_val = name();
std::string & id_part = m_buffer;
id_part.clear();
for (unsigned i = 0; i < id_sz; i++) {
if (cs[i] == '.') {
m_name_val = name(m_name_val, id_part.c_str());
id_part.clear();
} else {
id_part.push_back(cs[i]);
}
}
m_name_val = name(m_name_val, id_part.c_str());
return token_kind::Identifier;
} else {
move_back(cs.size() - key_sz, num_utfs - key_utf_sz);
m_token_info = *info;
return m_token_info.is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
}
}
static name * g_begin_script_tk = nullptr;
static name * g_begin_comment_tk = nullptr;
static name * g_begin_comment_block_tk = nullptr;
void initialize_scanner() {
g_begin_script_tk = new name("(*");
g_begin_comment_tk = new name("--");
g_begin_comment_block_tk = new name("/-");
}
void finalize_scanner() {
delete g_begin_script_tk;
delete g_begin_comment_tk;
delete g_begin_comment_block_tk;
}
auto scanner::scan(environment const & env) -> token_kind {
m_tokens = &get_token_table(env);
while (true) {
char c = curr();
m_pos = m_upos;
m_line = m_sline;
switch (c) {
refactor(frontends/lean/scanner): remove dependency to seekg and unget methods It is not safe to use seekg for textual files. Here is a fragment from a C++ manual: seekg() and seekp() This pair of functions serve respectively to change the position of stream pointers get and put. Both functions are overloaded with two different prototypes: seekg ( pos_type position ); seekp ( pos_type position ); Using this prototype the stream pointer is changed to an absolute position from the beginning of the file. The type required is the same as that returned by functions tellg and tellp. seekg ( off_type offset, seekdir direction ); seekp ( off_type offset, seekdir direction ); Using this prototype, an offset from a concrete point determined by parameter direction can be specified. It can be: ios::beg offset specified from the beginning of the stream ios::cur offset specified from the current position of the stream pointer ios::end offset specified from the end of the stream The values of both stream pointers get and put are counted in different ways for text files than for binary files, since in text mode files some modifications to the appearance of some special characters can occur. For that reason it is advisable to use only the first prototype of seekg and seekp with files opened in text mode and always use non-modified values returned by tellg or tellp. With binary files, you can freely use all the implementations for these functions. They should not have any unexpected behavior.
2014-09-18 22:24:48 +00:00
case ' ': case '\r': case '\t': case '\n':
next();
break;
case '\"':
return read_string();
case '`':
return read_quoted_symbol();
case -1:
return token_kind::Eof;
default:
if (std::isdigit(c)) {
return read_number();
} else {
token_kind k = read_key_cmd_id();
if (k == token_kind::Keyword) {
// We treat '(--', '(*', '--' as "keywords.
name const & n = m_token_info.value();
if (n == *g_begin_comment_tk)
read_single_line_comment();
else if (n == *g_begin_comment_block_tk)
read_comment_block();
else if (n == *g_begin_script_tk)
return read_script_block();
else
return k;
} else {
return k;
}
}
}
}
}
scanner::scanner(std::istream & strm, char const * strm_name, unsigned line):
m_tokens(nullptr), m_stream(strm) {
m_stream_name = strm_name ? strm_name : "[unknown]";
refactor(frontends/lean/scanner): remove dependency to seekg and unget methods It is not safe to use seekg for textual files. Here is a fragment from a C++ manual: seekg() and seekp() This pair of functions serve respectively to change the position of stream pointers get and put. Both functions are overloaded with two different prototypes: seekg ( pos_type position ); seekp ( pos_type position ); Using this prototype the stream pointer is changed to an absolute position from the beginning of the file. The type required is the same as that returned by functions tellg and tellp. seekg ( off_type offset, seekdir direction ); seekp ( off_type offset, seekdir direction ); Using this prototype, an offset from a concrete point determined by parameter direction can be specified. It can be: ios::beg offset specified from the beginning of the stream ios::cur offset specified from the current position of the stream pointer ios::end offset specified from the end of the stream The values of both stream pointers get and put are counted in different ways for text files than for binary files, since in text mode files some modifications to the appearance of some special characters can occur. For that reason it is advisable to use only the first prototype of seekg and seekp with files opened in text mode and always use non-modified values returned by tellg or tellp. With binary files, you can freely use all the implementations for these functions. They should not have any unexpected behavior.
2014-09-18 22:24:48 +00:00
m_sline = line;
m_line = line;
m_spos = 0;
m_upos = 0;
if (std::getline(m_stream, m_curr_line)) {
m_last_line = false;
m_curr_line.push_back('\n');
m_curr = m_curr_line[m_spos];
m_uskip = get_utf8_size(m_curr);
m_uskip--;
} else {
m_last_line = true;
m_curr = EOF;
m_uskip = 0;
}
}
std::ostream & operator<<(std::ostream & out, scanner::token_kind k) {
out << static_cast<unsigned>(k);
return out;
}
}