From d10d70423a797c24dd2d7cbb7a440a4ab3fe1260 Mon Sep 17 00:00:00 2001 From: Leonardo de Moura Date: Thu, 5 Jun 2014 18:55:36 -0700 Subject: [PATCH] feat(frontends/lean): add new scanner Signed-off-by: Leonardo de Moura --- src/CMakeLists.txt | 2 +- src/frontends/lean/CMakeLists.txt | 4 +- src/frontends/lean/scanner.cpp | 332 ++++++++++++++++++++++++ src/frontends/lean/scanner.h | 75 ++++++ src/tests/frontends/lean/CMakeLists.txt | 18 +- src/tests/frontends/lean/scanner.cpp | 187 ++++++++----- 6 files changed, 539 insertions(+), 79 deletions(-) create mode 100644 src/frontends/lean/scanner.cpp create mode 100644 src/frontends/lean/scanner.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a2be9640b..8e569ea91 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -260,7 +260,7 @@ add_subdirectory(tests/library) # add_subdirectory(tests/library/rewriter) # add_subdirectory(tests/library/tactic) # add_subdirectory(tests/library/elaborator) -# add_subdirectory(tests/frontends/lean) +add_subdirectory(tests/frontends/lean) # Include style check include(StyleCheck) diff --git a/src/frontends/lean/CMakeLists.txt b/src/frontends/lean/CMakeLists.txt index faaeca9d7..d2b53d6e7 100644 --- a/src/frontends/lean/CMakeLists.txt +++ b/src/frontends/lean/CMakeLists.txt @@ -1,2 +1,4 @@ -add_library(lean_frontend register_module.cpp token_set.cpp) +add_library(lean_frontend register_module.cpp token_set.cpp +scanner.cpp) + target_link_libraries(lean_frontend ${LEAN_LIBS}) diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp new file mode 100644 index 000000000..9f9579b2e --- /dev/null +++ b/src/frontends/lean/scanner.cpp @@ -0,0 +1,332 @@ +/* +Copyright (c) 2014 Microsoft Corporation. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. + +Author: Leonardo de Moura +*/ +#include +#include +#include "util/exception.h" +#include "frontends/lean/scanner.h" + +namespace lean { +void scanner::next() { + lean_assert(m_curr != EOF); + m_curr = m_stream.get(); + m_spos++; +} + +void scanner::check_not_eof(char const * error_msg) const { + if (curr() == EOF) throw_exception(error_msg); +} + +[[ noreturn ]] void scanner::throw_exception(char const * msg) const { + throw parser_exception(msg, m_stream_name.c_str(), m_sline, m_spos); +} + +auto scanner::read_string() -> token_kind { + static char const * end_error_msg = "unexpected end of string"; + lean_assert(curr() == '\"'); + next(); + m_buffer.clear(); + while (true) { + check_not_eof(end_error_msg); + update_line(); + char c = curr(); + if (c == '\"') { + next(); + return token_kind::String; + } else if (c == '\\') { + next(); + check_not_eof(end_error_msg); + c = curr(); + if (c != '\\' && c != '\"' && c != 'n') + throw_exception("invalid escape sequence"); + if (c == 'n') + c = '\n'; + } + m_buffer += c; + next(); + } +} + +bool scanner::is_next_digit() { + lean_assert(curr() != EOF); + char c = m_stream.get(); + bool r = std::isdigit(c); + m_stream.unget(); + return r; +} + +auto scanner::read_number() -> token_kind { + lean_assert('0' <= curr() && curr() <= '9'); + mpq q(1); + m_num_val = curr() - '0'; + next(); + bool is_decimal = false; + + while (true) { + char c = curr(); + if ('0' <= c && c <= '9') { + m_num_val = 10*m_num_val + (c - '0'); + if (is_decimal) + q *= 10; + next(); + } else if (c == '.') { + // Num. is not a decimal. It should be at least Num.0 + if (is_next_digit()) { + if (is_decimal) + break; + is_decimal = true; + next(); + } else { + break; + } + } else { + break; + } + } + if (is_decimal) + m_num_val /= q; + return is_decimal ? token_kind::Decimal : token_kind::Numeral; +} + +void scanner::read_single_line_comment() { + while (true) { + if (curr() == '\n') { + new_line(); + next(); + return; + } else if (curr() == EOF) { + return; + } else { + next(); + } + } +} + +// Try to consume str, return true if success. +// Throw a parser exception error_msg if end of file is found +bool scanner::consume(char const * str, char const * error_msg) { + if (curr() == str[0]) { + next(); + unsigned i = 1; + while (true) { + if (!str[i]) + return true; + check_not_eof(error_msg); + update_line(); + if (curr_next() != str[i]) + return false; + i++; + } + } else { + return false; + } +} + +static char const * g_begin_comment_block = "(--"; +static char const * g_end_comment_block = "--)"; + +void scanner::read_comment_block() { + static char const * end_error_msg = "unexpected end of comment block"; + unsigned nesting = 1; + while (true) { + if (consume(g_begin_comment_block, end_error_msg)) { + nesting++; + } + if (consume(g_end_comment_block, end_error_msg)) { + nesting--; + if (nesting == 0) + return; + } + check_not_eof(end_error_msg); + update_line(); + next(); + } +} + +// Read until the end_str is found, store all characters (not including end_str) in m_buffer. +// Throw a parser exception error_msg if end of file is found before end_str. +void scanner::read_until(char const * end_str, char const * error_msg) { + lean_assert(end_str); + lean_assert(end_str[0]); + m_buffer.clear(); + while (true) { + check_not_eof(error_msg); + update_line(); + char c = curr_next(); + if (c == end_str[0]) { + m_aux_buffer.clear(); + m_aux_buffer += c; + unsigned i = 1; + while (true) { + if (!end_str[i]) + return; + check_not_eof(error_msg); + update_line(); + c = curr_next(); + if (c != end_str[i]) { + m_buffer += m_aux_buffer; + break; + } + i++; + } + } else { + m_buffer += c; + } + } +} + +auto scanner::read_script_block() -> token_kind { + read_until("*)", "unexpected end of script"); + return token_kind::ScriptBlock; +} + +static bool is_id_first(char c) { return std::isalpha(c) || c == '_'; } +static bool is_id_rest(char c) { return std::isalnum(c) || c == '_' || c == '\''; } + +void scanner::move_back(std::streamoff offset) { + if (offset != 0) { + if (curr() == EOF) { + m_curr = 0; + m_stream.clear(); + m_spos--; + } + m_stream.seekg(offset, std::ios_base::cur); + m_spos += offset; + next(); + } +} + +bool scanner::is_next_id_rest() { + lean_assert(curr() != EOF); + char c = m_stream.get(); + bool r = is_id_rest(c); + m_stream.unget(); + return r; +} + +auto scanner::read_key_cmd_id() -> token_kind { + static char const * error_msg = "unexpected token"; + char c = curr(); + unsigned num_cs = 1; // number of characters read + token_set const * it = find(m_tokens, c); + token_info const * info = nullptr; + unsigned key_size = 0; + if (it) { + info = value_of(*it); + if (info) + key_size = 1; + } + + std::string & id_part = m_buffer; + bool is_id = is_id_first(c); + unsigned id_size = 0; + if (is_id) { + m_name_val = name(); + id_part.clear(); + id_part.push_back(c); + id_size = 1; + } + + while (it || is_id) { + next(); + c = curr(); + if (c != EOF) + num_cs++; + + if (is_id) { + if (is_id_rest(c)) { + id_part.push_back(c); + id_size++; + } else if (c == '.' && is_next_id_rest()) { + m_name_val = name(m_name_val, id_part.c_str()); + id_size++; + id_part.clear(); + } else { + is_id = false; + m_name_val = name(m_name_val, id_part.c_str()); + if (!it) + return token_kind::Identifier; + } + } + + if (it) { + it = find(*it, c); + if (it) { + if (auto new_info = value_of(*it)) { + info = new_info; + key_size = num_cs; + } + } else if (!is_id) { + if (id_size > key_size) { + move_back(static_cast(id_size) - static_cast(num_cs)); + return token_kind::Identifier; + } else if (info) { + move_back(static_cast(key_size) - static_cast(num_cs)); + m_token_info = info; + return info->is_command() ? token_kind::CommandKeyword : token_kind::Keyword; + } + } + } + } + throw_exception(error_msg); +} + +static name g_begin_script_tk("(*"); +static name g_begin_comment_tk("--"); +static name g_begin_comment_block_tk("(--"); + +auto scanner::scan() -> token_kind { + while (true) { + char c = curr(); + m_pos = m_spos; + m_line = m_sline; + switch (c) { + case ' ': case '\r': case '\t': + next(); + break; + case '\n': + next(); new_line(); + break; + case '\"': + return read_string(); + case -1: + return token_kind::Eof; + default: + if (std::isdigit(c)) { + return read_number(); + } else { + token_kind k = read_key_cmd_id(); + if (k == token_kind::Keyword) { + // We treat '(--', '(*', '--' as "keywords. + name const & n = m_token_info->value(); + if (n == g_begin_comment_tk) + read_single_line_comment(); + else if (n == g_begin_comment_block_tk) + read_comment_block(); + else if (n == g_begin_script_tk) + return read_script_block(); + else + return k; + } else { + return k; + } + } + } + } +} + +scanner::scanner(token_set const & tks, std::istream & strm, char const * strm_name): + m_tokens(tks), m_stream(strm), m_spos(0), m_sline(1), m_curr(0), m_pos(0), m_line(1), + m_token_info(nullptr) { + m_stream_name = strm_name ? strm_name : "[unknown]"; + next(); +} + +std::ostream & operator<<(std::ostream & out, scanner::token_kind k) { + out << static_cast(k); + return out; +} +} diff --git a/src/frontends/lean/scanner.h b/src/frontends/lean/scanner.h new file mode 100644 index 000000000..8f011bf83 --- /dev/null +++ b/src/frontends/lean/scanner.h @@ -0,0 +1,75 @@ +/* +Copyright (c) 2014 Microsoft Corporation. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. + +Author: Leonardo de Moura +*/ +#include +#include +#include "util/name.h" +#include "util/numerics/mpq.h" +#include "frontends/lean/token_set.h" + +namespace lean { +/** + \brief Scanner. The behavior of the scanner is controlled using a token set. + + The scanner has builtin support for comments, script blocks, + identifiers, numerals, decimals, strings. Everything else is only + accepted if they are in the token set. +*/ +class scanner { +public: + enum class token_kind {Keyword, CommandKeyword, ScriptBlock, Identifier, Numeral, Decimal, String, Eof}; +protected: + token_set m_tokens; + std::istream & m_stream; + std::string m_stream_name; + + int m_spos; // current position + int m_sline; // current line + char m_curr; // current char; + + int m_pos; // start position of the token + int m_line; // line of the token + + name m_name_val; + token_info const * m_token_info; + mpq m_num_val; + std::string m_buffer; + std::string m_aux_buffer; + + [[ noreturn ]] void throw_exception(char const * msg) const; + void next(); + char curr() const { return m_curr; } + char curr_next() { char c = curr(); next(); return c; } + void new_line() { m_sline++; m_spos = 0; } + void update_line() { if (curr() == '\n') new_line(); } + void check_not_eof(char const * error_msg) const; + bool is_next_digit(); + bool is_next_id_rest(); + void move_back(std::streamoff offset); + bool consume(char const * str, char const * error_msg); + void read_single_line_comment(); + void read_comment_block(); + void read_until(char const * end_str, char const * error_msg); + + token_kind read_string(); + token_kind read_number(); + token_kind read_script_block(); + token_kind read_key_cmd_id(); + +public: + scanner(token_set const & tks, std::istream & strm, char const * strm_name = nullptr); + + int get_line() const { return m_line; } + int get_pos() const { return m_pos; } + token_kind scan(); + + mpq const & get_num_val() const { return m_num_val; } + name const & get_name_val() const { return m_name_val; } + std::string const & get_str_val() const { return m_buffer; } + token_info const & get_token_info() const { lean_assert(m_token_info); return *m_token_info; } +}; +std::ostream & operator<<(std::ostream & out, scanner::token_kind k); +} diff --git a/src/tests/frontends/lean/CMakeLists.txt b/src/tests/frontends/lean/CMakeLists.txt index b964e9814..94305a8b7 100644 --- a/src/tests/frontends/lean/CMakeLists.txt +++ b/src/tests/frontends/lean/CMakeLists.txt @@ -1,15 +1,9 @@ -add_executable(lean_frontend_tst frontend.cpp) -target_link_libraries(lean_frontend_tst ${EXTRA_LIBS}) -add_test(lean_frontend ${CMAKE_CURRENT_BINARY_DIR}/lean_frontend_tst) -set_tests_properties(lean_frontend PROPERTIES ENVIRONMENT "LEAN_PATH=${LEAN_BINARY_DIR}/shell") add_executable(lean_scanner scanner.cpp) target_link_libraries(lean_scanner ${EXTRA_LIBS}) add_test(lean_scanner ${CMAKE_CURRENT_BINARY_DIR}/lean_scanner) -add_executable(lean_parser parser.cpp) -target_link_libraries(lean_parser ${EXTRA_LIBS}) -add_test(lean_parser ${CMAKE_CURRENT_BINARY_DIR}/lean_parser) -set_tests_properties(lean_parser PROPERTIES ENVIRONMENT "LEAN_PATH=${LEAN_BINARY_DIR}/shell") -add_executable(lean_pp pp.cpp) -target_link_libraries(lean_pp ${EXTRA_LIBS}) -add_test(lean_pp ${CMAKE_CURRENT_BINARY_DIR}/lean_pp) -set_tests_properties(lean_pp PROPERTIES ENVIRONMENT "LEAN_PATH=${LEAN_BINARY_DIR}/shell") +# add_executable(lean_parser parser.cpp) +# target_link_libraries(lean_parser ${EXTRA_LIBS}) +# add_test(lean_parser ${CMAKE_CURRENT_BINARY_DIR}/lean_parser) +# # add_executable(lean_pp pp.cpp) +# target_link_libraries(lean_pp ${EXTRA_LIBS}) +# add_test(lean_pp ${CMAKE_CURRENT_BINARY_DIR}/lean_pp) diff --git a/src/tests/frontends/lean/scanner.cpp b/src/tests/frontends/lean/scanner.cpp index f60b2d62f..d43975047 100644 --- a/src/tests/frontends/lean/scanner.cpp +++ b/src/tests/frontends/lean/scanner.cpp @@ -6,33 +6,44 @@ Author: Leonardo de Moura */ #include #include "util/test.h" -#include "util/exception.h" #include "util/escaped.h" +#include "util/exception.h" #include "frontends/lean/scanner.h" using namespace lean; -#define st scanner::token +#define tk scanner::token_kind -static void scan(char const * str, list const & cmds = list()) { +static void scan(char const * str, token_set set = mk_default_token_set()) { std::istringstream in(str); - scanner s(in, "[string]"); - for (name const & n : cmds) s.add_command_keyword(n); + scanner s(set, in, "[string]"); while (true) { - st t = s.scan(); - if (t == st::Eof) + tk k = s.scan(); + if (k == tk::Eof) break; - std::cout << t; - if (t == st::Id || t == st::CommandId) - std::cout << "[" << s.get_name_val() << "]"; - else if (t == st::IntVal || t == st::DecimalVal) - std::cout << "[" << s.get_num_val() << "]"; - else if (t == st::StringVal) + if (k == tk::Identifier) + std::cout << "id[" << s.get_name_val() << "]"; + else if (k == tk::CommandKeyword) + std::cout << "cmd[" << s.get_token_info().value() << "]"; + else if (k == tk::Keyword) + std::cout << "tk[" << s.get_token_info().value() << "]"; + else if (k == tk::Decimal || k == tk::Numeral) + std::cout << "n[" << s.get_num_val() << "]"; + else if (k == tk::String) std::cout << "[\"" << escaped(s.get_str_val().c_str()) << "\"]"; - std::cout << " "; + std::cout << ":" << s.get_pos() << ":" << s.get_line() << " "; } std::cout << "\n"; } +static void scan_success(char const * str, token_set set = mk_default_token_set()) { + try { + scan(str, set); + } catch (exception & ex) { + std::cout << "ERROR: " << ex.what() << "\n"; + lean_unreachable(); + } +} + static void scan_error(char const * str) { try { scan(str); @@ -42,65 +53,114 @@ static void scan_error(char const * str) { } } -static void check(char const * str, std::initializer_list const & l, list const & cmds = list()) { - auto it = l.begin(); - std::istringstream in(str); - scanner s(in, "[string]"); - for (name const & n : cmds) s.add_command_keyword(n); - while (true) { - st t = s.scan(); - if (t == st::Eof) { - lean_assert(it == l.end()); - return; +static void check(char const * str, std::initializer_list const & l, + token_set set = mk_default_token_set()) { + try { + auto it = l.begin(); + std::istringstream in(str); + scanner s(set, in, "[string]"); + while (true) { + tk k = s.scan(); + if (k == tk::Eof) { + lean_assert(it == l.end()); + return; + } + lean_assert(it != l.end()); + lean_assert_eq(k, *it); + ++it; } - lean_assert(it != l.end()); - lean_assert(t == *it); - ++it; + } catch (exception & ex) { + std::cout << "ERROR: " << ex.what() << "\n"; + lean_unreachable(); } } -static void check_name(char const * str, name const & expected) { +static void check_name(char const * str, name const & expected, token_set set = mk_default_token_set()) { std::istringstream in(str); - scanner s(in, "[string]"); - st t = s.scan(); - lean_assert(t == st::Id); + scanner s(set, in, "[string]"); + tk k = s.scan(); + lean_assert(k == tk::Identifier); lean_assert(s.get_name_val() == expected); - lean_assert(s.scan() == st::Eof); + lean_assert(s.scan() == tk::Eof); +} + +static void check_keyword(char const * str, name const & expected, token_set set = mk_default_token_set()) { + std::istringstream in(str); + scanner s(set, in, "[string]"); + tk k = s.scan(); + lean_assert(k == tk::Keyword); + lean_assert(s.get_token_info().value() == expected); + lean_assert(s.scan() == tk::Eof); } static void tst1() { - scan("fun(x: forall A : Type, A -> A), x+1 = 2.0 λ"); + token_set s = mk_default_token_set(); + s = add_token(s, "+", "plus"); + s = add_token(s, "=", "eq"); + scan_success("a..a"); + check("a..a", {tk::Identifier, tk::Keyword, tk::Keyword, tk::Identifier}); + check("Type.{0}", {tk::Keyword, tk::Keyword, tk::Numeral, tk::Keyword}); + s = add_token(s, "ab+cde", "weird1"); + s = add_token(s, "+cd", "weird2"); + scan_success("ab+cd", s); + check("ab+cd", {tk::Identifier, tk::Keyword}, s); + scan_success("ab+cde", s); + check("ab+cde", {tk::Keyword}, s); + scan_success("Type.{0}"); + scan_success("0.a a"); + scan_success("0."); + scan_success("0.."); + scan_success("0..1"); + scan_success("fun"); + scan_success(".."); + scan_success("...."); + scan_success("....\n.."); + scan_success("a", s); + scan_success("a. b.c.."); + scan_success(".. .."); + scan_success("....\n.."); + scan_success("fun(x: forall A : Type, A -> A), x+1 = 2.0 λvalue.foo. . . a", s); } static void tst2() { - scan("x::name"); - scan("x::10::foo"); - check("x::name", {st::Id}); - check("fun (x : Bool), x", {st::Lambda, st::LeftParen, st::Id, st::Colon, st::Id, st::RightParen, st::Comma, st::Id}); - check("+++", {st::Id}); - check("x+y", {st::Id, st::Id, st::Id}); - check("-- testing", {}); - check(" 2.31 ", {st::DecimalVal}); - check(" 333 22", {st::IntVal, st::IntVal}); - check("Int -> Int", {st::Id, st::Arrow, st::Id}); - check("Int -+-> Int", {st::Id, st::Id, st::Id}); - check("x := 10", {st::Id, st::Assign, st::IntVal}); - check("(x+1):Int", {st::LeftParen, st::Id, st::Id, st::IntVal, st::RightParen, st::Colon, st::Id}); - check("{x}", {st::LeftCurlyBracket, st::Id, st::RightCurlyBracket}); - check("\u03BB \u2200 \u2192", {st::Lambda, st::Pi, st::Arrow}); - scan("++\u2295++x\u2296\u2296"); - check("++\u2295++x\u2296\u2296", {st::Id, st::Id, st::Id, st::Id, st::Id}); - scan("x10 ... == (* print('hello') *) have by"); + scan("x.name"); + scan("x.foo"); + check("x.name", {tk::Identifier}); + check("fun (x : Bool), x", {tk::Keyword, tk::Keyword, tk::Identifier, tk::Keyword, tk::Identifier, + tk::Keyword, tk::Keyword, tk::Identifier}); check_name("x10", name("x10")); - check_name("x::10", name(name("x"), 10)); - check_name("x::10::bla::0", name(name(name(name("x"), 10), "bla"), 0u)); - check("0::1", {st::IntVal, st::Colon, st::Colon, st::IntVal}); - check_name("\u2296\u2296", name("\u2296\u2296")); - scan_error("x::1000000000000000000"); - scan("Theorem a : Bool Axiom b : Int", list({"Theorem", "Axiom"})); - check("Theorem a : Bool Axiom b : Int", {st::CommandId, st::Id, st::Colon, st::Id, st::CommandId, st::Id, st::Colon, st::Id}, list({"Theorem", "Axiom"})); - scan("foo \"tst\\\"\" : Int"); - check("foo \"tst\\\"\" : Int", {st::Id, st::StringVal, st::Colon, st::Id}); + // check_name("x.10", name(name("x"), 10)); + check_name("x.bla", name({"x", "bla"})); + + scan_error("+++"); + token_set s = mk_default_token_set(); + s = add_token(s, "+++", "tplus"); + check_keyword("+++", "tplus", s); + s = add_token(s, "+", "plus"); + check("x+y", {tk::Identifier, tk::Keyword, tk::Identifier}, s); + check("-- testing", {}); + check("(-- testing --)", {}); + check("(-- (-- testing\n --) --)", {}); + check(" 2.31 ", {tk::Decimal}); + check("2.31", {tk::Decimal}); + check(" 333 22", {tk::Numeral, tk::Numeral}); + check("int -> int", {tk::Identifier, tk::Keyword, tk::Identifier}); + check("int->int", {tk::Identifier, tk::Keyword, tk::Identifier}); + check_keyword("->", "->"); + s = add_token(s, "-+->", "arrow"); + check("Int -+-> Int", {tk::Identifier, tk::Keyword, tk::Identifier}, s); + check("x := 10", {tk::Identifier, tk::Keyword, tk::Numeral}); + check("{x}", {tk::Keyword, tk::Identifier, tk::Keyword}); + check("\u03BB \u2200 \u2192", {tk::Keyword, tk::Keyword, tk::Keyword}); + check_keyword("\u03BB", "fun"); + scan("x10 ... (* print('hello') *) have by"); + scan("0..1"); + check("0..1", {tk::Numeral, tk::Keyword, tk::Keyword, tk::Numeral}); + scan("theorem a : Bool axiom b : Int"); + check("theorem a : Bool axiom b : Int", {tk::CommandKeyword, tk::Identifier, tk::Keyword, tk::Identifier, + tk::CommandKeyword, tk::Identifier, tk::Keyword, tk::Identifier}); + scan("foo \"ttk\\\"\" : Int"); + check("foo \"ttk\\\"\" : Int", {tk::Identifier, tk::String, tk::Keyword, tk::Identifier}); scan_error("\"foo"); scan("2.13 1.2 0.5"); } @@ -109,12 +169,9 @@ static void tst3() { scan_error("\"\\"); scan_error("\"\\a"); scan("\"\naaa\""); - scan_error("foo::0a::1"); + scan_error("foo.+ 01"); scan("10.0."); - scan("{ } . forall exists let in \u2200 \u2203 := _"); - std::ostringstream out; - out << scanner::token::Eof; - lean_assert_eq(out.str(), "EOF"); + scan("{ } . forall exists let in \u2200 := _"); } int main() {