Move sexpr/format/options to util/sexpr (reason: circular dependency between util and numerics lib), now numerics depend on util, and sexpr depends on numerics and util. Add scanner to frontend.

Signed-off-by: Leonardo de Moura <leonardo@microsoft.com>
2013-08-13 03:40:51 -07:00 · 2013-08-13 03:40:51 -07:00 · 2ea7479ee9
commit 2ea7479ee9
parent ac38937241
19 changed files with 299 additions and 97 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -61,6 +61,7 @@ endif()

 include_directories(${LEAN_SOURCE_DIR}/util)
 include_directories(${LEAN_SOURCE_DIR}/util/numerics)
+include_directories(${LEAN_SOURCE_DIR}/util/sexpr)
 include_directories(${LEAN_SOURCE_DIR}/interval)
 include_directories(${LEAN_SOURCE_DIR}/kernel)
 include_directories(${LEAN_SOURCE_DIR}/kernel/arith)
@ -71,6 +72,8 @@ add_subdirectory(util)
 set(LEAN_LIBS ${LEAN_LIBS} util)
 add_subdirectory(util/numerics)
 set(LEAN_LIBS ${LEAN_LIBS} numerics)
+add_subdirectory(util/sexpr)
+set(LEAN_LIBS ${LEAN_LIBS} sexpr)
 add_subdirectory(interval)
 set(LEAN_LIBS ${LEAN_LIBS} interval)
 add_subdirectory(kernel)
--- a/src/frontend/CMakeLists.txt
+++ b/src/frontend/CMakeLists.txt
@ -1,2 +1,2 @@
-add_library(frontend frontend.cpp operator_info)
+add_library(frontend frontend.cpp operator_info.cpp scanner.cpp)
 target_link_libraries(frontend ${LEAN_LIBS})
--- a/src/frontend/scanner.cpp
+++ b/src/frontend/scanner.cpp
@ -7,77 +7,280 @@ Author: Leonardo de Moura
 #include <cstdio>
 #include "scanner.h"
 #include "debug.h"
+#include "exception.h"

 namespace lean {

-scanner::scanner(std::istream& stream, bool interactive):
-    m_interactive(interactive),
+static name g_lambda_unicode("\u03BB");
+static name g_pi_unicode("\u03A0");
+static name g_arrow_unicode("\u2192");
+static char g_normalized[255];
+static name g_lambda_name("fun");
+static name g_pi_name("pi");
+static name g_arrow_name("->");
+
+class init_normalized_table {
+public:
+    init_normalized_table() {
+        // by default all characters are in group c,
+        for (int i = 0; i <= 255; i++)
+            g_normalized[i] = 'c';
+
+        // digits normalize to '0'
+        for (int i = '0'; i <= '9'; i++)
+            g_normalized[i] = '0';
+
+        // characters that can be used to create ids of group a
+        for (int i = 'a'; i <= 'z'; i++)
+            g_normalized[i] = 'a';
+        for (int i = 'A'; i <= 'Z'; i++)
+            g_normalized[i] = 'a';
+        g_normalized['_'] = 'a';
+
+        // characters that can be used to create ids of group b
+        for (unsigned char b : {'=', '<', '>', '@', '^', '|', '&', '~', '+', '-', '*', '/', '$', '%', '?', ';'})
+            g_normalized[b] = 'b';
+
+        // punctuation
+        g_normalized['('] = '(';
+        g_normalized[')'] = ')';
+        g_normalized['{'] = '{';
+        g_normalized['}'] = '}';
+        g_normalized[':'] = ':';
+        g_normalized['.'] = '.';
+        g_normalized[','] = ',';
+
+        // spaces
+        g_normalized[' ']  = ' ';
+        g_normalized['\t'] = ' ';
+        g_normalized['\r'] = ' ';
+
+        // new line
+        g_normalized['\n'] = '\n';
+
+        // eof
+        g_normalized[255] = -1;
+    }
+};
+
+static init_normalized_table g_init_normalized_table;
+
+char normalize(char c) {
+    return g_normalized[static_cast<unsigned char>(c)];
+}
+
+scanner::scanner(std::istream& stream):
    m_spos(0),
    m_curr(0),
    m_line(1),
    m_pos(0),
-    m_bpos(0),
-    m_bend(0),
-    m_stream(stream),
-    m_cache_input(false) {
+    m_stream(stream) {
+    next();
 }

 scanner::~scanner() {
 }

 void scanner::next() {
-    if (m_cache_input)
-        m_cache.push_back(m_curr);
    lean_assert(m_curr != EOF);
-    if (m_interactive) {
    m_curr = m_stream.get();
-    }
-    else if (m_bpos < m_bend) {
-        m_curr = m_buffer[m_bpos];
-        m_bpos++;
-    }
-    else {
-        m_stream.read(m_buffer, scanner_buffer_size);
-        m_bend = static_cast<unsigned>(m_stream.gcount());
-        m_bpos = 0;
-        if (m_bpos == m_bend) {
-            m_curr = EOF;
-        }
-        else {
-            m_curr = m_buffer[m_bpos];
-            m_bpos++;
-        }
-    }
    m_spos++;
 }

-void scanner::start_caching() {
-    m_cache_input = true;
-    m_cache.clear();
+bool scanner::check_next(char c) {
+    lean_assert(m_curr != EOF);
+    bool r = m_stream.get() == c;
+    m_stream.unget();
+    return r;
 }

-void scanner::stop_caching() {
-    m_cache_input = false;
+void scanner::read_comment() {
+    lean_assert(curr() == '*');
+    next();
+    int nest = 1;
+    while (true) {
+        if (curr() == '*') {
+            next();
+            if (curr() == ')') {
+                next();
+                nest--;
+                if (nest == 0)
+                    return;
+            }
+        } else if (curr() == '(') {
+            next();
+            if (curr() == '*') {
+                next();
+                nest++;
+            }
+        } else if (curr() == '\n') {
+            new_line();
+            next();
+        } else if (curr() == EOF) {
+            throw exception("unexpected end of comment");
+        } else {
+            next();
+        }
+    }
 }

-unsigned scanner::cache_size() const {
-    return m_cache.size();
+bool scanner::is_command(name const & n) const {
+    return false;
 }

-void scanner::reset_cache() {
-    m_cache.clear();
+scanner::token scanner::read_a_symbol() {
+    lean_assert(normalize(curr()) == 'a');
+    m_buffer.clear();
+    m_buffer += curr();
+    m_name_val = name();
+    next();
+    while (true) {
+        if (normalize(curr()) == 'a') {
+            m_buffer += curr();
+            next();
+        } else if (curr() == ':' && check_next(':')) {
+            next();
+            lean_assert(curr() == ':');
+            next();
+            if (m_name_val.is_anonymous())
+                m_name_val = name(m_buffer.c_str());
+            else
+                m_name_val = name(m_name_val, m_buffer.c_str());
+        } else {
+            if (m_name_val.is_anonymous())
+                m_name_val = name(m_buffer.c_str());
+            else
+                m_name_val = name(m_name_val, m_buffer.c_str());
+            if (m_name_val == g_lambda_name)
+                return token::Lambda;
+            else if (m_name_val == g_pi_name)
+                return token::Pi;
+            else
+                return is_command(m_name_val) ? token::CommandId : token::Id;
+        }
+    }
 }

-char const * scanner::cached_str(unsigned begin, unsigned end) {
-    m_cache_result.clear();
-    while (isspace(m_cache[begin]) && begin < end)
-        begin++;
-    while (begin < end && isspace(m_cache[end-1]))
-        end--;
-    for (unsigned i = begin; i < end; i++)
-        m_cache_result.push_back(m_cache[i]);
-    m_cache_result.push_back(0);
-    return m_cache_result.data();
+scanner::token scanner::read_b_symbol() {
+    lean_assert(normalize(curr()) == 'b');
+    m_buffer.clear();
+    m_buffer += curr();
+    next();
+    while (true) {
+        if (normalize(curr()) == 'b') {
+            m_buffer += curr();
+            next();
+        } else {
+            m_name_val = name(m_buffer.c_str());
+            if (m_name_val == g_arrow_name)
+                return token::Arrow;
+            else
+                return token::Id;
+        }
+    }
 }

+scanner::token scanner::read_c_symbol() {
+    lean_assert(normalize(curr()) == 'c');
+    m_buffer.clear();
+    m_buffer += curr();
+    next();
+    while (true) {
+        if (normalize(curr()) == 'c') {
+            m_buffer += curr();
+            next();
+        } else {
+            m_name_val = name(m_buffer.c_str());
+            if (m_name_val == g_arrow_unicode)
+                return token::Arrow;
+            else if (m_name_val == g_lambda_unicode)
+                return token::Lambda;
+            else if (m_name_val == g_pi_unicode)
+                return token::Pi;
+            else
+                return token::Id;
+        }
+    }
+}
+
+scanner::token scanner::read_number() {
+    lean_assert('0' <= curr() && curr() <= '9');
+    mpq q(1);
+    m_num_val = curr() - '0';
+    next();
+    bool is_decimal = false;
+
+    while (true) {
+        char c = curr();
+        if ('0' <= c && c <= '9') {
+            m_num_val = 10*m_num_val + (c - '0');
+            if (is_decimal)
+                q *= 10;
+            next();
+        } else if (c == '.') {
+            if (is_decimal)
+                break;
+            is_decimal = true;
+            next();
+        } else {
+            break;
+        }
+    }
+    if (is_decimal)
+        m_num_val /= q;
+    return is_decimal ? token::Decimal : token::Int;
+}
+
+scanner::token scanner::scan() {
+    while (true) {
+        char c = curr();
+        m_pos = m_spos;
+        switch (normalize(c)) {
+        case ' ':  next(); break;
+        case '\n': next(); new_line(); break;
+        case ':':  next(); return token::Colon;
+        case ',':  next(); return token::Comma;
+        case '.':  next(); return token::Period;
+        case '(':
+            next();
+            if (curr() == '*') {
+                read_comment();
+                break;
+            } else {
+                return token::LeftParen;
+            }
+        case ')': next(); return token::RightParen;
+        case '{': next(); return token::LeftCurlyBracket;
+        case '}': next(); return token::RightCurlyBracket;
+        case 'a': return read_a_symbol();
+        case 'b': return read_b_symbol();
+        case 'c': return read_c_symbol();
+        case '0': return read_number();
+        case -1:  return token::Eof;
+        default: lean_unreachable();
+        }
+    }
+}
+std::ostream & operator<<(std::ostream & out, scanner::token const & t) {
+    switch (t) {
+    case scanner::token::LeftParen:         out << "("; break;
+    case scanner::token::RightParen:        out << ")"; break;
+    case scanner::token::LeftCurlyBracket:  out << "{"; break;
+    case scanner::token::RightCurlyBracket: out << "}"; break;
+    case scanner::token::Colon:             out << ":"; break;
+    case scanner::token::Comma:             out << ","; break;
+    case scanner::token::Period:            out << "."; break;
+    case scanner::token::Lambda:            out << g_lambda_unicode; break;
+    case scanner::token::Pi:                out << g_pi_unicode; break;
+    case scanner::token::Arrow:             out << g_arrow_unicode; break;
+    case scanner::token::Id:                out << "Id"; break;
+    case scanner::token::CommandId:         out << "CId"; break;
+    case scanner::token::Int:               out << "Int"; break;
+    case scanner::token::Decimal:           out << "Dec"; break;
+    case scanner::token::Eq:                out << "="; break;
+    case scanner::token::Assign:            out << ":="; break;
+    case scanner::token::Eof:               out << "EOF"; break;
+    }
+    return out;
+}
 }
--- a/src/frontend/scanner.h
+++ b/src/frontend/scanner.h
@ -7,49 +7,52 @@ Author: Leonardo de Moura
 #pragma once
 #include <iostream>
 #include <vector>
+#include "mpq.h"
+#include "name.h"

 namespace lean {
-constexpr unsigned scanner_buffer_size = 1024;
-
 /**
-    \brief Base class for all scanners in Lean
-
-    It provides basic support for reading streams, buffering, and caching the contents of the input stream.
+    \brief Lean scanner.
 */
 class scanner {
+public:
+    enum class token {
+        LeftParen, RightParen, LeftCurlyBracket, RightCurlyBracket, Colon, Comma, Period, Lambda, Pi, Arrow,
+        Id, CommandId, Int, Decimal, Eq, Assign, Eof
+    };
 protected:
-    bool               m_interactive;
    int                m_spos; // position in the current line of the stream
    char               m_curr;  // current char;

    int                m_line;  // line
    int                m_pos;   // start position of the token
+    std::istream &     m_stream;

-    char               m_buffer[scanner_buffer_size];
-    unsigned           m_bpos;
-    unsigned           m_bend;
-    std::istream&      m_stream;
-
-    bool               m_cache_input;
-    std::vector<char>  m_cache;
-    std::vector<char>  m_cache_result;
+    mpq                m_num_val;
+    name               m_name_val;
+    std::string        m_buffer;

    char curr() const { return m_curr; }
    void new_line() { m_line++; m_spos = 0; }
    void next();
+    bool check_next(char c);
+    void  read_comment();
+    token read_a_symbol();
+    token read_b_symbol();
+    token read_c_symbol();
+    token read_number();
+    bool is_command(name const & n) const;

 public:
-    scanner(std::istream& stream, bool interactive = false);
+    scanner(std::istream& stream);
    ~scanner();

    int get_line() const { return m_line; }
    int get_pos() const { return m_pos; }
+    token scan();

-    void start_caching();
-    void stop_caching();
-    unsigned cache_size() const;
-    void reset_cache();
-    char const * cached_str(unsigned begin, unsigned end);
+    name const & get_name_val() const { return m_name_val; }
+    mpq const & get_num_val() const { return m_num_val; }
 };
-
+std::ostream & operator<<(std::ostream & out, scanner::token const & t);
 }
--- a/src/kernel/level.h
+++ b/src/kernel/level.h
@ -65,5 +65,5 @@ inline bool is_max (level const & l) { return kind(l) == level_kind::Max;  }
 inline level const * max_begin_levels(level const & l) { return &max_level(l, 0); }
 inline level const * max_end_levels(level const & l)   { return max_begin_levels(l) + max_size(l); }

-format pp(level const & l, char const * sep = default_name_separator);
+format pp(level const & l, char const * sep = LEAN_NAME_SEPARATOR);
 }
--- a/src/kernel/pp.cpp
+++ b/src/kernel/pp.cpp
@ -17,7 +17,7 @@ struct pp_fn {
    pp_fn(environment const * env):m_env(env) {}

    unsigned indent() const { return 2; }
-    char const * sep() const { return default_name_separator; }
+    char const * sep() const { return LEAN_NAME_SEPARATOR; }
    format pp_keyword(char const * k) { return highlight(format(k), format::format_color::ORANGE); }
    format pp_type_kwd() { return highlight(format("Type"), format::format_color::PINK); }
    format pp_eq_kwd() { return highlight(format(" = "), format::format_color::BLUE); }
--- a/src/tests/frontend/CMakeLists.txt
+++ b/src/tests/frontend/CMakeLists.txt
@ -1,3 +1,6 @@
 add_executable(frontend_tst frontend.cpp)
 target_link_libraries(frontend_tst ${EXTRA_LIBS})
 add_test(frontend ${CMAKE_CURRENT_BINARY_DIR}/frontend_tst)
+add_executable(scanner scanner.cpp)
+target_link_libraries(scanner ${EXTRA_LIBS})
+add_test(scanner ${CMAKE_CURRENT_BINARY_DIR}/scanner)
--- a/src/tests/util/CMakeLists.txt
+++ b/src/tests/util/CMakeLists.txt
@ -4,9 +4,9 @@ add_test(interrupt ${CMAKE_CURRENT_BINARY_DIR}/interrupt)
 add_executable(name name.cpp)
 target_link_libraries(name ${EXTRA_LIBS})
 add_test(name ${CMAKE_CURRENT_BINARY_DIR}/name)
-add_executable(sexpr sexpr.cpp)
-target_link_libraries(sexpr ${EXTRA_LIBS})
-add_test(sexpr ${CMAKE_CURRENT_BINARY_DIR}/sexpr)
+add_executable(sexpr_tst sexpr.cpp)
+target_link_libraries(sexpr_tst ${EXTRA_LIBS})
+add_test(sexpr ${CMAKE_CURRENT_BINARY_DIR}/sexpr_tst)
 add_executable(format format.cpp)
 target_link_libraries(format ${EXTRA_LIBS})
 add_test(format ${CMAKE_CURRENT_BINARY_DIR}/format)
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@ -1,4 +1,3 @@
 add_library(util trace.cpp debug.cpp name.cpp exception.cpp
  verbosity.cpp interrupt.cpp hash.cpp escaped.cpp bit_tricks.cpp
-  sexpr.cpp sexpr_funcs.cpp format.cpp safe_arith.cpp options.cpp
-  )
+  safe_arith.cpp)
--- a/src/util/name.cpp
+++ b/src/util/name.cpp
@ -13,14 +13,8 @@ Author: Leonardo de Moura
 #include "rc.h"
 #include "hash.h"
 #include "trace.h"
-#include "options.h"
-
-#ifndef LEAN_DEFAULT_NAME_SEPARATOR
-#define LEAN_DEFAULT_NAME_SEPARATOR "::"
-#endif

 namespace lean {
-
 constexpr char const * anonymous_str = "[anonymous]";

 /**
@ -269,7 +263,7 @@ size_t name::size(char const * sep) const {
 }

 size_t name::size() const {
-    return size(LEAN_DEFAULT_NAME_SEPARATOR);
+    return size(LEAN_NAME_SEPARATOR);
 }

 unsigned name::hash() const {
@ -283,11 +277,11 @@ std::string name::to_string(char const * sep) const {
 }

 std::string name::to_string() const {
-    return to_string(LEAN_DEFAULT_NAME_SEPARATOR);
+    return to_string(LEAN_NAME_SEPARATOR);
 }

 std::ostream & operator<<(std::ostream & out, name const & n) {
-    name::imp::display(out, LEAN_DEFAULT_NAME_SEPARATOR, n.m_ptr);
+    name::imp::display(out, LEAN_NAME_SEPARATOR, n.m_ptr);
    return out;
 }

@ -297,12 +291,6 @@ std::ostream & operator<<(std::ostream & out, name::sep const & s) {
    name::imp::display(out, s.m_sep, s.m_name.m_ptr);
    return out;
 }
-
-static name g_name_separator{"name", "separator"};
-
-char const * get_name_separator(options const & o) {
-    return o.get_string(g_name_separator, LEAN_DEFAULT_NAME_SEPARATOR);
-}
 }

 void print(lean::name const & n) { std::cout << n << std::endl; }
--- a/src/util/name.h
+++ b/src/util/name.h
@ -7,11 +7,12 @@ Author: Leonardo de Moura
 #pragma once
 #include <iostream>

+#ifndef LEAN_NAME_SEPARATOR
+#define LEAN_NAME_SEPARATOR "::"
+#endif
+
 namespace lean {
-
-constexpr char const * default_name_separator = "::";
 enum class name_kind { ANONYMOUS, STRING, NUMERAL };
-
 /**
   \brief Hierarchical names.
 */
@ -73,8 +74,4 @@ public:
 };
 struct name_hash { unsigned operator()(name const & n) const { return n.hash(); } };
 struct name_eq { bool operator()(name const & n1, name const & n2) const { return n1 == n2; } };
-
-class options;
-/** \brief Return the separator for hierarchical names */
-char const * get_name_separator(options const & o);
 }
--- a/src/util/sexpr/format.cpp
+++ b/src/util/sexpr/format.cpp
--- a/src/util/sexpr/format.h
+++ b/src/util/sexpr/format.h
--- a/src/util/sexpr/options.cpp
+++ b/src/util/sexpr/options.cpp
@ -106,4 +106,8 @@ std::ostream & operator<<(std::ostream & out, options const & o) {
    out << g_right_angle_bracket;
    return out;
 }
+static name g_name_separator{"name", "separator"};
+char const * get_name_separator(options const & o) {
+    return o.get_string(g_name_separator, "::");
+}
 }
--- a/src/util/sexpr/options.h
+++ b/src/util/sexpr/options.h
@ -51,4 +51,6 @@ public:
 };
 template<typename T> options update(options const & o, name const & n, T const & v) { return o.update(n, sexpr(v)); }
 template<typename T> options update(options const & o, char const * n, T const & v) { return o.update(name(n), sexpr(v)); }
+/** \brief Return the separator for hierarchical names */
+char const * get_name_separator(options const & o);
 }
--- a/src/util/sexpr/sexpr.cpp
+++ b/src/util/sexpr/sexpr.cpp
--- a/src/util/sexpr/sexpr.h
+++ b/src/util/sexpr/sexpr.h
--- a/src/util/sexpr/sexpr_funcs.cpp
+++ b/src/util/sexpr/sexpr_funcs.cpp
--- a/src/util/sexpr/sexpr_funcs.h
+++ b/src/util/sexpr/sexpr_funcs.h