From 189a300b118bda08a40efb0750ab8adc4c57a164 Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Mon, 28 Sep 2015 01:17:46 +0200
Subject: [PATCH] feat(frontends/lean): improve pretty printing space insertion
 heuristic

---
 src/frontends/lean/pp.cpp      | 110 ++++++++++++++++-----------------
 src/frontends/lean/pp.h        |   1 +
 src/frontends/lean/scanner.cpp |   8 +--
 src/frontends/lean/scanner.h   |   1 +
 src/util/sexpr/format.cpp      |  58 +++++++++++++++++
 src/util/sexpr/format.h        |   6 ++
 src/util/sexpr/sexpr_fn.h      |   3 +-
 src/util/utf8.cpp              |  12 ++++
 src/util/utf8.h                |   1 +
 9 files changed, 140 insertions(+), 60 deletions(-)
diff --git a/src/frontends/lean/pp.cpp b/src/frontends/lean/pp.cpp
index 77aaf2869..38705b72f 100644
--- a/src/frontends/lean/pp.cpp
+++ b/src/frontends/lean/pp.cpp
@@ -6,6 +6,8 @@ Author: Leonardo de Moura
 */
 #include <algorithm>
 #include <limits>
+#include <string>
+#include <util/utf8.h>
 #include "util/flet.h"
 #include "kernel/replace_fn.h"
 #include "kernel/free_vars.h"
@@ -36,6 +38,7 @@ Author: Leonardo de Moura
 #include "frontends/lean/builtin_exprs.h"
 #include "frontends/lean/parser_config.h"
 #include "frontends/lean/local_ref_info.h"
+#include "frontends/lean/scanner.h"
 
 namespace lean {
 static format * g_ellipsis_n_fmt  = nullptr;
@@ -996,20 +999,6 @@ auto pretty_fn::pp_notation_child(expr const & e, unsigned lbp, unsigned rbp) ->
     }
 }
 
-static bool add_extra_space_first(name const & tk) {
-    // TODO(Leo): this is a hard-coded temporary solution for deciding whether extra
-    // spaces should be added or not when pretty printing notation.
-    // We should implement a better solution in the future.
-    return tk != "(" && tk != ")" && tk != "[";
-}
-
-static bool add_extra_space(name const & tk) {
-    // TODO(Leo): this is a hard-coded temporary solution for deciding whether extra
-    // spaces should be added or not when pretty printing notation.
-    // We should implement a better solution in the future.
-    return tk != "," && tk != "(" && tk != ")" && tk != "[";
-}
-
 static bool is_atomic_notation(notation_entry const & entry) {
     if (!entry.is_nud())
         return false;
@@ -1034,21 +1023,18 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
         unsigned i         = ts.size();
         unsigned last_rbp  = inf_bp()-1;
         unsigned token_lbp = 0;
-        bool extra_space   = false;
-        bool last_is_skip  = false;
         bool last          = true;
         while (i > 0) {
             --i;
             format curr;
             notation::action const & a = ts[i].get_action();
             name const & tk = ts[i].get_token();
+            format tk_fmt = format(tk);
             switch (a.kind()) {
             case notation::action_kind::Skip:
-                curr = format(tk);
-                if (last) {
-                    last_rbp     = inf_bp();
-                    last_is_skip = true;
-                }
+                curr = tk_fmt;
+                if (last)
+                    last_rbp = inf_bp();
                 break;
             case notation::action_kind::Expr:
                 if (args.empty() || !args.back()) {
@@ -1057,14 +1043,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                     expr e = *args.back();
                     args.pop_back();
                     result e_r   = pp_notation_child(e, token_lbp, a.rbp());
-                    format e_fmt = e_r.fmt();
-                    curr = format(tk);
-                    // we add space after the token only when
-                    // 1- add_extra_space(tk) is true AND
-                    // 2- tk is the first token in a nud notation
-                    if (add_extra_space(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces))
-                        curr = curr + space();
-                    curr = curr + e_fmt;
+                    curr = tk_fmt + e_r.fmt();
                     if (last)
                         last_rbp = a.rbp();
                     break;
@@ -1109,8 +1088,6 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                     unsigned curr_lbp = token_lbp;
                     if (auto t = a.get_terminator()) {
                         curr = format(*t);
-                        if (add_extra_space(*t) && m_extra_spaces)
-                            curr = space() + curr;
                         curr_lbp = get_some_precedence(m_token_table, *t);
                     }
                     unsigned j       = rec_args.size();
@@ -1120,15 +1097,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                         --j;
                         result arg_res = pp_notation_child(rec_args[j], curr_lbp, a.rbp());
                         if (j == 0) {
-                            if (add_extra_space_first(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces))
-                                curr = format(tk) + space() + arg_res.fmt() + curr;
-                            else
-                                curr = format(tk) + arg_res.fmt() + curr;
+                            curr = tk_fmt + arg_res.fmt() + curr;
                         } else {
-                            curr = sep_fmt + space() + arg_res.fmt() + curr;
+                            curr = sep_fmt + arg_res.fmt() + curr;
                         }
-                        if (j > 0 && add_extra_space(a.get_sep()))
-                            curr = space() + curr;
                         curr_lbp = sep_lbp;
                     }
                     break;
@@ -1136,12 +1108,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
             case notation::action_kind::Binder:
                 if (locals.size() != 1)
                     return optional<result>();
-                curr = format(tk) + pp_binder(locals[0]);
-                if (!last)
-                    curr = curr + space();
+                curr = tk_fmt + pp_binder(locals[0]);
                 break;
             case notation::action_kind::Binders:
-                curr = format(tk) + pp_binders(locals);
+                curr = tk_fmt + pp_binders(locals);
                 break;
             case notation::action_kind::ScopedExpr:
                 if (args.empty() || !args.back()) {
@@ -1177,8 +1147,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                     if (locals.empty())
                         return optional<result>();
                     result e_r   = pp_notation_child(e, token_lbp, a.rbp());
-                    format e_fmt = e_r.fmt();
-                    curr = format(tk) + space() + e_fmt;
+                    curr = tk_fmt + e_r.fmt();
                     if (last)
                         last_rbp = a.rbp();
                     break;
@@ -1192,12 +1161,8 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                 fmt = curr;
                 last = false;
             } else {
-                if (extra_space)
-                    curr = curr + space();
                 fmt = curr + fmt;
             }
-            if (m_extra_spaces || !last_is_skip)
-                extra_space = add_extra_space(tk);
         }
         unsigned first_lbp = inf_bp();
         if (!entry.is_nud()) {
@@ -1208,10 +1173,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
             expr e = *args.back();
             args.pop_back();
             format e_fmt = pp_notation_child(e, token_lbp, 0).fmt();
-            if (m_extra_spaces || !last_is_skip)
-                fmt = e_fmt + space() + fmt;
-            else
-                fmt = e_fmt + fmt;
+            fmt = e_fmt + fmt;
         }
         return optional<result>(result(first_lbp, last_rbp, fmt));
     }
@@ -1221,7 +1183,7 @@ auto pretty_fn::pp_notation(expr const & e) -> optional<result> {
     if (!m_notation || is_var(e))
         return optional<result>();
     for (notation_entry const & entry : get_notation_entries(m_env, head_index(e))) {
-        if (entry.group() != notation_entry_group::Main)
+        if (entry.group() == notation_entry_group::Reserve)
             continue;
         if (!m_unicode && !entry.is_safe_ascii())
             continue; // ignore this notation declaration since unicode support is not enabled
@@ -1338,12 +1300,50 @@ class pp_beta_reduce_fn : public replace_visitor {
     }
 };
 
+std::string sexpr_to_string(sexpr const & s) {
+    if (is_string(s))
+        return to_string(s);
+    std::stringstream ss;
+    ss << s;
+    return ss.str();
+}
+
+// check whether a space must be inserted between the strings so that lexing them would
+// produce separate tokens
+bool pretty_fn::needs_space_sep(std::string const & s1, std::string const & s2) const {
+    if (is_id_rest(get_utf8_last_char(s1.data()), s1.data() + s1.size()) && is_id_rest(s2.data(), s2.data() + s2.size()))
+        return true; // would be lexed as a single identifier without space
+
+
+    // check whether s1 + s2 has a longer prefix in the token table than s1
+    token_table const * t = &m_token_table;
+    for (char c : s1) {
+        t = t->find(c);
+        if (!t)
+            return false; // s1 must be an identifier, and we know s2 does not start with is_id_rest
+    }
+    for (char c : s2) {
+        t = t->find(c);
+        if (!t)
+            return false;
+        if (t->value())
+            return true;
+    }
+    return true; // the next identifier may expand s1 + s2 to a token
+}
+
 format pretty_fn::operator()(expr const & e) {
     m_depth = 0; m_num_steps = 0;
+    result r;
     if (m_beta)
-        return pp_child(purify(pp_beta_reduce_fn()(e)), 0).fmt();
+        r = pp_child(purify(pp_beta_reduce_fn()(e)), 0);
     else
-        return pp_child(purify(e), 0).fmt();
+        r = pp_child(purify(e), 0);
+
+    // insert spaces so that lexing the result round-trips
+    std::function<bool(sexpr const &, sexpr const &)> sep; // NOLINT
+    sep = [&](sexpr const & s1, sexpr const & s2) { return needs_space_sep(sexpr_to_string(s1), sexpr_to_string(s2)); };
+    return r.fmt().separate_tokens(sep);
 }
 
 formatter_factory mk_pretty_formatter_factory() {
diff --git a/src/frontends/lean/pp.h b/src/frontends/lean/pp.h
index d87fbf0e1..eb35cf89a 100644
--- a/src/frontends/lean/pp.h
+++ b/src/frontends/lean/pp.h
@@ -93,6 +93,7 @@ private:
     optional<result> pp_notation(expr const & e);
 
     result add_paren_if_needed(result const & r, unsigned bp);
+    bool needs_space_sep(std::string const &s1, std::string const &s2) const;
 
     result pp_overriden_local_ref(expr const & e);
     bool ignore_local_ref(expr const & e);
diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp
index f6d6f78fe..87ebd3903 100644
--- a/src/frontends/lean/scanner.cpp
+++ b/src/frontends/lean/scanner.cpp
@@ -314,10 +314,10 @@ static bool is_id_first(buffer<char> const & cs, unsigned i) {
     return is_letter_like_unicode(u);
 }
 
-static bool is_id_rest(buffer<char> const & cs, unsigned i)  {
-    if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'')
+bool is_id_rest(char const * begin, char const * end) {
+    if (std::isalnum(*begin) || *begin == '_' || *begin == '\'')
         return true;
-    unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
+    unsigned u = utf8_to_unicode(begin, end);
     return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u);
 }
 
@@ -337,7 +337,7 @@ auto scanner::read_key_cmd_id() -> token_kind {
             unsigned i = id_sz;
             next_utf(cs);
             num_utfs++;
-            if (is_id_rest(cs, i)) {
+            if (is_id_rest(&cs[i], cs.end())) {
             } else if (cs[i] == '.') {
                 next_utf(cs);
                 num_utfs++;
diff --git a/src/frontends/lean/scanner.h b/src/frontends/lean/scanner.h
index 641a4e281..cdc20267b 100644
--- a/src/frontends/lean/scanner.h
+++ b/src/frontends/lean/scanner.h
@@ -92,6 +92,7 @@ public:
     };
 };
 std::ostream & operator<<(std::ostream & out, scanner::token_kind k);
+bool is_id_rest(char const * begin, char const * end);
 void initialize_scanner();
 void finalize_scanner();
 }
diff --git a/src/util/sexpr/format.cpp b/src/util/sexpr/format.cpp
index 95c948aca..200d3cf25 100644
--- a/src/util/sexpr/format.cpp
+++ b/src/util/sexpr/format.cpp
@@ -177,6 +177,64 @@ format wrap(format const & f1, format const & f2) {
     return f1 + choice(format(" "), line()) + f2;
 }
 
+std::tuple<sexpr, sexpr const *> format::separate_tokens(sexpr const & s, sexpr const * last,
+                                                         std::function<bool(sexpr const &, sexpr const &)> sep // NOLINT
+) const {
+    switch (sexpr_kind(s)) {
+        case format_kind::NIL:
+        case format_kind::LINE:
+        case format_kind::COLOR_BEGIN:
+        case format_kind::COLOR_END:
+            return std::make_tuple(s, last);
+        case format_kind::COMPOSE:
+        case format_kind::FLAT_COMPOSE:
+        {
+            sexpr list = sexpr_compose_list(s);
+            list = map(list, [&](sexpr const & s) {
+                sexpr t;
+                std::tie(t, last) = separate_tokens(s, last, sep);
+                return t;
+            });
+            sexpr t = sexpr_kind(m_value) == format_kind::COMPOSE ? sexpr_compose(list) : sexpr_flat_compose(list);
+            return std::make_tuple(t, last);
+        }
+        case format_kind::NEST:
+        {
+            sexpr t;
+            std::tie(t, last) = separate_tokens(sexpr_nest_s(s), last, sep);
+            return std::make_tuple(sexpr_nest(sexpr_nest_i(s), t), last);
+        }
+        case format_kind::TEXT:
+        {
+            sexpr const & text = sexpr_text_t(s);
+            if (last && sep(*last, text))
+                return std::make_tuple(sexpr_compose({*g_sexpr_space, s}), &text);
+            else
+                return std::make_tuple(s, &text);
+        }
+        case format_kind::CHOICE:
+        {
+            // we assume that choices only differ in spacing and thus share their last token
+            sexpr s1, s2; sexpr const * last1, * last2;
+            std::tie(s1, last1) = separate_tokens(sexpr_choice_1(s), last, sep);
+            std::tie(s2, last2) = separate_tokens(sexpr_choice_2(s), last, sep);
+            lean_assert(last1 == last2 || (last1 && last2 && *last1 == *last2));
+            return std::make_tuple(sexpr_choice(s1, s2), last1);
+        }
+    }
+    lean_unreachable(); // LCOV_EXCL_LINE
+}
+
+/**
+   \brief Replaces every text sepxr \c t with <tt>compose(" ", t)</tt> if there is a preceding
+   text sexpr \c s and <tt>sep(s, t)</tt> is true
+*/
+format format::separate_tokens(
+            std::function<bool(sexpr const &, sexpr const &)> sep // NOLINT
+) const {
+    return format(std::get<0>(separate_tokens(m_value, nullptr, sep)));
+}
+
 /**
    \brief Auxiliary exception used to sign that the amount of
    available space was exhausted. It is used in \c space_upto_line_break and
diff --git a/src/util/sexpr/format.h b/src/util/sexpr/format.h
index fb86fce40..fffc0587f 100644
--- a/src/util/sexpr/format.h
+++ b/src/util/sexpr/format.h
@@ -120,6 +120,10 @@ private:
         return sexpr(sexpr(format::format_kind::LINE), sexpr());
     }
 
+    std::tuple<sexpr, sexpr const *> separate_tokens(sexpr const & s, sexpr const * last,
+                                                     std::function<bool(sexpr const &, sexpr const &)> sep //NOLINT
+    ) const;
+
     // Functions used inside of pretty printing
     static bool space_upto_line_break_list_exceeded(sexpr const & s, int available, std::vector<pair<sexpr, unsigned>> const & todo);
     static int space_upto_line_break(sexpr const & s, int available, bool & found);
@@ -169,6 +173,8 @@ public:
     bool is_nil_fmt() const { return kind() == format_kind::NIL; }
     unsigned hash() const { return m_value.hash(); }
 
+    format separate_tokens(std::function<bool(sexpr const &, sexpr const &)> sep) const; // NOLINT
+
     friend format compose(format const & f1, format const & f2);
     friend format nest(int i, format const & f);
     friend format highlight(format const & f, format::format_color const c);
diff --git a/src/util/sexpr/sexpr_fn.h b/src/util/sexpr/sexpr_fn.h
index 15e6de8bf..9460a11a1 100644
--- a/src/util/sexpr/sexpr_fn.h
+++ b/src/util/sexpr/sexpr_fn.h
@@ -32,7 +32,8 @@ sexpr map(sexpr const & l, F f) {
         return l;
     } else {
         lean_assert(is_cons(l));
-        return sexpr(f(head(l)), map(tail(l), f));
+        auto x = f(head(l)); // force left-to-right evaluation order
+        return sexpr(x, map(tail(l), f));
     }
 }
 
diff --git a/src/util/utf8.cpp b/src/util/utf8.cpp
index 11dd3e66b..5cbd8d9af 100644
--- a/src/util/utf8.cpp
+++ b/src/util/utf8.cpp
@@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE.
 Author: Leonardo de Moura
 */
 #include <cstdlib>
+#include "util/debug.h"
 
 namespace lean {
 bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; }
@@ -37,4 +38,15 @@ size_t utf8_strlen(char const * str) {
     }
     return r;
 }
+
+char const * get_utf8_last_char(char const * str) {
+    char const * r;
+    lean_assert(*str != 0);
+    do {
+        r = str;
+        unsigned sz = get_utf8_size(*str);
+        str += sz;
+    } while (*str != 0);
+    return r;
+}
 }
diff --git a/src/util/utf8.h b/src/util/utf8.h
index 3641952cf..75b2d4854 100644
--- a/src/util/utf8.h
+++ b/src/util/utf8.h
@@ -9,4 +9,5 @@ namespace lean {
 bool is_utf8_next(unsigned char c);
 unsigned get_utf8_size(unsigned char c);
 size_t utf8_strlen(char const * str);
+char const * get_utf8_last_char(char const * str);
 }