refactor(frontends/lean/scanner): remove dependency to seekg and unget

methods It is not safe to use seekg for textual files. Here is a fragment from a C++ manual: seekg() and seekp() This pair of functions serve respectively to change the position of stream pointers get and put. Both functions are overloaded with two different prototypes: seekg ( pos_type position ); seekp ( pos_type position ); Using this prototype the stream pointer is changed to an absolute position from the beginning of the file. The type required is the same as that returned by functions tellg and tellp. seekg ( off_type offset, seekdir direction ); seekp ( off_type offset, seekdir direction ); Using this prototype, an offset from a concrete point determined by parameter direction can be specified. It can be: ios::beg offset specified from the beginning of the stream ios::cur offset specified from the current position of the stream pointer ios::end offset specified from the end of the stream The values of both stream pointers get and put are counted in different ways for text files than for binary files, since in text mode files some modifications to the appearance of some special characters can occur. For that reason it is advisable to use only the first prototype of seekg and seekp with files opened in text mode and always use non-modified values returned by tellg or tellp. With binary files, you can freely use all the implementations for these functions. They should not have any unexpected behavior.
2014-09-18 15:24:48 -07:00 · 2014-09-18 15:24:48 -07:00 · b3e05a2fe9
commit b3e05a2fe9
parent 243f80231a
3 changed files with 49 additions and 28 deletions
--- a/src/frontends/lean/scanner.cpp
+++ b/src/frontends/lean/scanner.cpp
@ -80,12 +80,33 @@ bool is_super_sub_script_alnum_unicode(unsigned u) {

 void scanner::next() {
    lean_assert(m_curr != EOF);
-    m_curr = m_stream.get();
    m_spos++;
-    if (m_uskip > 0) {
-        if (!is_utf8_next(m_curr)) {
-            throw_exception("invalid utf-8 sequence character");
+    while (m_spos >= static_cast<int>(m_curr_line.size())) {
+        if (m_last_line) {
+            m_curr = EOF;
+            return;
+        } else {
+            m_curr_line.clear();
+            if (std::getline(m_stream, m_curr_line)) {
+                m_curr_line.push_back('\n');
+                m_sline++;
+                m_spos  = 0;
+                m_upos  = 0;
+                m_curr  = m_curr_line[m_spos];
+                m_uskip = get_utf8_size(m_curr);
+                m_uskip--;
+                return;
+            } else {
+                m_last_line = true;
+                m_curr      = EOF;
+                return;
            }
+        }
+    }
+    m_curr = m_curr_line[m_spos];
+    if (m_uskip > 0) {
+        if (!is_utf8_next(m_curr))
+            throw_exception("invalid utf-8 sequence character");
        m_uskip--;
    } else {
        m_upos++;
@ -113,7 +134,6 @@ auto scanner::read_string() -> token_kind {
    m_buffer.clear();
    while (true) {
        check_not_eof(end_error_msg);
-        update_line();
        char c = curr();
        if (c == '\"') {
            next();
@ -155,10 +175,10 @@ auto scanner::read_quoted_symbol() -> token_kind {

 bool scanner::is_next_digit() {
    lean_assert(curr() != EOF);
-    char c = m_stream.get();
-    bool r = std::isdigit(c);
-    m_stream.unget();
-    return r;
+    if (m_spos + 1 < static_cast<int>(m_curr_line.size()))
+        return std::isdigit(m_curr_line[m_spos+1]);
+    else
+        return false;
 }

 auto scanner::read_number() -> token_kind {
@ -198,7 +218,6 @@ void scanner::read_single_line_comment() {
    while (true) {
        if (curr() == '\n') {
            next();
-            new_line();
            return;
        } else if (curr() == EOF) {
            return;
@ -218,7 +237,6 @@ bool scanner::consume(char const * str, char const * error_msg) {
            if (!str[i])
                return true;
            check_not_eof(error_msg);
-            update_line();
            if (curr_next() != str[i])
                return false;
            i++;
@ -244,7 +262,6 @@ void scanner::read_comment_block() {
                return;
        }
        check_not_eof(end_error_msg);
-        update_line();
        next();
    }
 }
@ -257,7 +274,6 @@ void scanner::read_until(char const * end_str, char const * error_msg) {
    m_buffer.clear();
    while (true) {
        check_not_eof(error_msg);
-        update_line();
        char c = curr_next();
        if (c == end_str[0]) {
            m_aux_buffer.clear();
@ -267,7 +283,6 @@ void scanner::read_until(char const * end_str, char const * error_msg) {
                if (!end_str[i])
                    return;
                check_not_eof(error_msg);
-                update_line();
                c = curr_next();
                if (c != end_str[i]) {
                    m_buffer += m_aux_buffer;
@ -291,14 +306,12 @@ void scanner::move_back(unsigned offset, unsigned u_offset) {
    if (offset != 0) {
        if (curr() == EOF) {
            m_curr = 0;
-            m_stream.clear();
            m_spos--;
            m_upos--;
            offset--;
            u_offset--;
        }
        if (offset != 0) {
-            m_stream.seekg(-static_cast<std::streamoff>(offset), std::ios_base::cur);
            m_spos -= offset;
            m_upos -= u_offset;
        }
@ -431,12 +444,9 @@ auto scanner::scan(environment const & env) -> token_kind {
        m_pos  = m_upos;
        m_line = m_sline;
        switch (c) {
-        case ' ': case '\r': case '\t':
+        case ' ': case '\r': case '\t': case '\n':
            next();
            break;
-        case '\n':
-            next(); new_line();
-            break;
        case '\"':
            return read_string();
        case '`':
@ -468,12 +478,23 @@ auto scanner::scan(environment const & env) -> token_kind {
 }

 scanner::scanner(std::istream & strm, char const * strm_name, unsigned line):
-    m_tokens(nullptr), m_stream(strm), m_spos(0), m_upos(0), m_uskip(0), m_sline(line), m_curr(0), m_pos(0), m_line(line),
-    m_token_info(nullptr) {
+    m_tokens(nullptr), m_stream(strm), m_token_info(nullptr) {
    m_stream_name = strm_name ? strm_name : "[unknown]";
-    next();
+    m_sline = line;
+    m_line  = line;
    m_spos  = 0;
    m_upos  = 0;
+    if (std::getline(m_stream, m_curr_line)) {
+        m_last_line = false;
+        m_curr_line.push_back('\n');
+        m_curr  = m_curr_line[m_spos];
+        m_uskip = get_utf8_size(m_curr);
+        m_uskip--;
+    } else {
+        m_last_line = true;
+        m_curr      = EOF;
+        m_uskip     = 0;
+    }
 }

 std::ostream & operator<<(std::ostream & out, scanner::token_kind k) {
--- a/src/frontends/lean/scanner.h
+++ b/src/frontends/lean/scanner.h
@ -27,6 +27,8 @@ protected:
    token_table const * m_tokens;
    std::istream &      m_stream;
    std::string         m_stream_name;
+    std::string         m_curr_line;
+    bool                m_last_line;

    int                 m_spos;  // current position
    int                 m_upos;  // current position taking into account utf-8 encoding
@ -47,8 +49,6 @@ protected:
    void next();
    char curr() const { return m_curr; }
    char curr_next() { char c = curr(); next(); return c; }
-    void new_line() { m_sline++; m_spos = 0; m_upos = 0; }
-    void update_line() { if (curr() == '\n') new_line(); }
    void check_not_eof(char const * error_msg);
    bool is_next_digit();
    bool is_next_id_rest();
--- a/tests/lean/interactive/in4.input.expected.out
+++ b/tests/lean/interactive/in4.input.expected.out
@ -11,7 +11,7 @@ rfl
 -- AFTER REMOVE 8&9
 -- BEGININFO STALE NAY
 -- ENDINFO
-- BEGININFO NAY
+-- BEGININFO STALE NAY
 -- ENDINFO
 -- BEGINWAIT
 -- ENDWAIT