feat(frontends/lean): improve pretty printing space insertion heuristic

2015-09-28 01:17:46 +02:00 · 2015-09-28 01:17:46 +02:00 · 189a300b11
commit 189a300b11
parent 8fd8ff2773
9 changed files with 140 additions and 60 deletions
--- a/src/frontends/lean/pp.cpp
+++ b/src/frontends/lean/pp.cpp
@ -6,6 +6,8 @@ Author: Leonardo de Moura
 */
 #include <algorithm>
 #include <limits>
+#include <string>
+#include <util/utf8.h>
 #include "util/flet.h"
 #include "kernel/replace_fn.h"
 #include "kernel/free_vars.h"
@ -36,6 +38,7 @@ Author: Leonardo de Moura
 #include "frontends/lean/builtin_exprs.h"
 #include "frontends/lean/parser_config.h"
 #include "frontends/lean/local_ref_info.h"
+#include "frontends/lean/scanner.h"

 namespace lean {
 static format * g_ellipsis_n_fmt  = nullptr;
@ -996,20 +999,6 @@ auto pretty_fn::pp_notation_child(expr const & e, unsigned lbp, unsigned rbp) ->
    }
 }

-static bool add_extra_space_first(name const & tk) {
-    // TODO(Leo): this is a hard-coded temporary solution for deciding whether extra
-    // spaces should be added or not when pretty printing notation.
-    // We should implement a better solution in the future.
-    return tk != "(" && tk != ")" && tk != "[";
-}
-
-static bool add_extra_space(name const & tk) {
-    // TODO(Leo): this is a hard-coded temporary solution for deciding whether extra
-    // spaces should be added or not when pretty printing notation.
-    // We should implement a better solution in the future.
-    return tk != "," && tk != "(" && tk != ")" && tk != "[";
-}
-
 static bool is_atomic_notation(notation_entry const & entry) {
    if (!entry.is_nud())
        return false;
@ -1034,21 +1023,18 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
        unsigned i         = ts.size();
        unsigned last_rbp  = inf_bp()-1;
        unsigned token_lbp = 0;
-        bool extra_space   = false;
-        bool last_is_skip  = false;
        bool last          = true;
        while (i > 0) {
            --i;
            format curr;
            notation::action const & a = ts[i].get_action();
            name const & tk = ts[i].get_token();
+            format tk_fmt = format(tk);
            switch (a.kind()) {
            case notation::action_kind::Skip:
-                curr = format(tk);
-                if (last) {
-                    last_rbp     = inf_bp();
-                    last_is_skip = true;
-                }
+                curr = tk_fmt;
+                if (last)
+                    last_rbp = inf_bp();
                break;
            case notation::action_kind::Expr:
                if (args.empty() || !args.back()) {
@ -1057,14 +1043,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                    expr e = *args.back();
                    args.pop_back();
                    result e_r   = pp_notation_child(e, token_lbp, a.rbp());
-                    format e_fmt = e_r.fmt();
-                    curr = format(tk);
-                    // we add space after the token only when
-                    // 1- add_extra_space(tk) is true AND
-                    // 2- tk is the first token in a nud notation
-                    if (add_extra_space(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces))
-                        curr = curr + space();
-                    curr = curr + e_fmt;
+                    curr = tk_fmt + e_r.fmt();
                    if (last)
                        last_rbp = a.rbp();
                    break;
@ -1109,8 +1088,6 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                    unsigned curr_lbp = token_lbp;
                    if (auto t = a.get_terminator()) {
                        curr = format(*t);
-                        if (add_extra_space(*t) && m_extra_spaces)
-                            curr = space() + curr;
                        curr_lbp = get_some_precedence(m_token_table, *t);
                    }
                    unsigned j       = rec_args.size();
@ -1120,15 +1097,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                        --j;
                        result arg_res = pp_notation_child(rec_args[j], curr_lbp, a.rbp());
                        if (j == 0) {
-                            if (add_extra_space_first(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces))
-                                curr = format(tk) + space() + arg_res.fmt() + curr;
-                            else
-                                curr = format(tk) + arg_res.fmt() + curr;
+                            curr = tk_fmt + arg_res.fmt() + curr;
                        } else {
-                            curr = sep_fmt + space() + arg_res.fmt() + curr;
+                            curr = sep_fmt + arg_res.fmt() + curr;
                        }
-                        if (j > 0 && add_extra_space(a.get_sep()))
-                            curr = space() + curr;
                        curr_lbp = sep_lbp;
                    }
                    break;
@ -1136,12 +1108,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
            case notation::action_kind::Binder:
                if (locals.size() != 1)
                    return optional<result>();
-                curr = format(tk) + pp_binder(locals[0]);
-                if (!last)
-                    curr = curr + space();
+                curr = tk_fmt + pp_binder(locals[0]);
                break;
            case notation::action_kind::Binders:
-                curr = format(tk) + pp_binders(locals);
+                curr = tk_fmt + pp_binders(locals);
                break;
            case notation::action_kind::ScopedExpr:
                if (args.empty() || !args.back()) {
@ -1177,8 +1147,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                    if (locals.empty())
                        return optional<result>();
                    result e_r   = pp_notation_child(e, token_lbp, a.rbp());
-                    format e_fmt = e_r.fmt();
-                    curr = format(tk) + space() + e_fmt;
+                    curr = tk_fmt + e_r.fmt();
                    if (last)
                        last_rbp = a.rbp();
                    break;
@ -1192,12 +1161,8 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
                fmt = curr;
                last = false;
            } else {
-                if (extra_space)
-                    curr = curr + space();
                fmt = curr + fmt;
            }
-            if (m_extra_spaces || !last_is_skip)
-                extra_space = add_extra_space(tk);
        }
        unsigned first_lbp = inf_bp();
        if (!entry.is_nud()) {
@ -1208,10 +1173,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
            expr e = *args.back();
            args.pop_back();
            format e_fmt = pp_notation_child(e, token_lbp, 0).fmt();
-            if (m_extra_spaces || !last_is_skip)
-                fmt = e_fmt + space() + fmt;
-            else
-                fmt = e_fmt + fmt;
+            fmt = e_fmt + fmt;
        }
        return optional<result>(result(first_lbp, last_rbp, fmt));
    }
@ -1221,7 +1183,7 @@ auto pretty_fn::pp_notation(expr const & e) -> optional<result> {
    if (!m_notation || is_var(e))
        return optional<result>();
    for (notation_entry const & entry : get_notation_entries(m_env, head_index(e))) {
-        if (entry.group() != notation_entry_group::Main)
+        if (entry.group() == notation_entry_group::Reserve)
            continue;
        if (!m_unicode && !entry.is_safe_ascii())
            continue; // ignore this notation declaration since unicode support is not enabled
@ -1338,12 +1300,50 @@ class pp_beta_reduce_fn : public replace_visitor {
    }
 };

+std::string sexpr_to_string(sexpr const & s) {
+    if (is_string(s))
+        return to_string(s);
+    std::stringstream ss;
+    ss << s;
+    return ss.str();
+}
+
+// check whether a space must be inserted between the strings so that lexing them would
+// produce separate tokens
+bool pretty_fn::needs_space_sep(std::string const & s1, std::string const & s2) const {
+    if (is_id_rest(get_utf8_last_char(s1.data()), s1.data() + s1.size()) && is_id_rest(s2.data(), s2.data() + s2.size()))
+        return true; // would be lexed as a single identifier without space
+
+
+    // check whether s1 + s2 has a longer prefix in the token table than s1
+    token_table const * t = &m_token_table;
+    for (char c : s1) {
+        t = t->find(c);
+        if (!t)
+            return false; // s1 must be an identifier, and we know s2 does not start with is_id_rest
+    }
+    for (char c : s2) {
+        t = t->find(c);
+        if (!t)
+            return false;
+        if (t->value())
+            return true;
+    }
+    return true; // the next identifier may expand s1 + s2 to a token
+}
+
 format pretty_fn::operator()(expr const & e) {
    m_depth = 0; m_num_steps = 0;
+    result r;
    if (m_beta)
-        return pp_child(purify(pp_beta_reduce_fn()(e)), 0).fmt();
+        r = pp_child(purify(pp_beta_reduce_fn()(e)), 0);
    else
-        return pp_child(purify(e), 0).fmt();
+        r = pp_child(purify(e), 0);
+
+    // insert spaces so that lexing the result round-trips
+    std::function<bool(sexpr const &, sexpr const &)> sep; // NOLINT
+    sep = [&](sexpr const & s1, sexpr const & s2) { return needs_space_sep(sexpr_to_string(s1), sexpr_to_string(s2)); };
+    return r.fmt().separate_tokens(sep);
 }

 formatter_factory mk_pretty_formatter_factory() {
--- a/src/frontends/lean/pp.h
+++ b/src/frontends/lean/pp.h
@ -93,6 +93,7 @@ private:
    optional<result> pp_notation(expr const & e);

    result add_paren_if_needed(result const & r, unsigned bp);
+    bool needs_space_sep(std::string const &s1, std::string const &s2) const;

    result pp_overriden_local_ref(expr const & e);
    bool ignore_local_ref(expr const & e);
--- a/src/frontends/lean/scanner.cpp
+++ b/src/frontends/lean/scanner.cpp
@ -314,10 +314,10 @@ static bool is_id_first(buffer<char> const & cs, unsigned i) {
    return is_letter_like_unicode(u);
 }

-static bool is_id_rest(buffer<char> const & cs, unsigned i)  {
-    if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'')
+bool is_id_rest(char const * begin, char const * end) {
+    if (std::isalnum(*begin) || *begin == '_' || *begin == '\'')
        return true;
-    unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
+    unsigned u = utf8_to_unicode(begin, end);
    return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u);
 }

@ -337,7 +337,7 @@ auto scanner::read_key_cmd_id() -> token_kind {
            unsigned i = id_sz;
            next_utf(cs);
            num_utfs++;
-            if (is_id_rest(cs, i)) {
+            if (is_id_rest(&cs[i], cs.end())) {
            } else if (cs[i] == '.') {
                next_utf(cs);
                num_utfs++;
--- a/src/frontends/lean/scanner.h
+++ b/src/frontends/lean/scanner.h
@ -92,6 +92,7 @@ public:
    };
 };
 std::ostream & operator<<(std::ostream & out, scanner::token_kind k);
+bool is_id_rest(char const * begin, char const * end);
 void initialize_scanner();
 void finalize_scanner();
 }
--- a/src/util/sexpr/format.cpp
+++ b/src/util/sexpr/format.cpp
@ -177,6 +177,64 @@ format wrap(format const & f1, format const & f2) {
    return f1 + choice(format(" "), line()) + f2;
 }

+std::tuple<sexpr, sexpr const *> format::separate_tokens(sexpr const & s, sexpr const * last,
+                                                         std::function<bool(sexpr const &, sexpr const &)> sep // NOLINT
+) const {
+    switch (sexpr_kind(s)) {
+        case format_kind::NIL:
+        case format_kind::LINE:
+        case format_kind::COLOR_BEGIN:
+        case format_kind::COLOR_END:
+            return std::make_tuple(s, last);
+        case format_kind::COMPOSE:
+        case format_kind::FLAT_COMPOSE:
+        {
+            sexpr list = sexpr_compose_list(s);
+            list = map(list, [&](sexpr const & s) {
+                sexpr t;
+                std::tie(t, last) = separate_tokens(s, last, sep);
+                return t;
+            });
+            sexpr t = sexpr_kind(m_value) == format_kind::COMPOSE ? sexpr_compose(list) : sexpr_flat_compose(list);
+            return std::make_tuple(t, last);
+        }
+        case format_kind::NEST:
+        {
+            sexpr t;
+            std::tie(t, last) = separate_tokens(sexpr_nest_s(s), last, sep);
+            return std::make_tuple(sexpr_nest(sexpr_nest_i(s), t), last);
+        }
+        case format_kind::TEXT:
+        {
+            sexpr const & text = sexpr_text_t(s);
+            if (last && sep(*last, text))
+                return std::make_tuple(sexpr_compose({*g_sexpr_space, s}), &text);
+            else
+                return std::make_tuple(s, &text);
+        }
+        case format_kind::CHOICE:
+        {
+            // we assume that choices only differ in spacing and thus share their last token
+            sexpr s1, s2; sexpr const * last1, * last2;
+            std::tie(s1, last1) = separate_tokens(sexpr_choice_1(s), last, sep);
+            std::tie(s2, last2) = separate_tokens(sexpr_choice_2(s), last, sep);
+            lean_assert(last1 == last2 || (last1 && last2 && *last1 == *last2));
+            return std::make_tuple(sexpr_choice(s1, s2), last1);
+        }
+    }
+    lean_unreachable(); // LCOV_EXCL_LINE
+}
+
+/**
+   \brief Replaces every text sepxr \c t with <tt>compose(" ", t)</tt> if there is a preceding
+   text sexpr \c s and <tt>sep(s, t)</tt> is true
+*/
+format format::separate_tokens(
+            std::function<bool(sexpr const &, sexpr const &)> sep // NOLINT
+) const {
+    return format(std::get<0>(separate_tokens(m_value, nullptr, sep)));
+}
+
 /**
   \brief Auxiliary exception used to sign that the amount of
   available space was exhausted. It is used in \c space_upto_line_break and
--- a/src/util/sexpr/format.h
+++ b/src/util/sexpr/format.h
@ -120,6 +120,10 @@ private:
        return sexpr(sexpr(format::format_kind::LINE), sexpr());
    }

+    std::tuple<sexpr, sexpr const *> separate_tokens(sexpr const & s, sexpr const * last,
+                                                     std::function<bool(sexpr const &, sexpr const &)> sep //NOLINT
+    ) const;
+
    // Functions used inside of pretty printing
    static bool space_upto_line_break_list_exceeded(sexpr const & s, int available, std::vector<pair<sexpr, unsigned>> const & todo);
    static int space_upto_line_break(sexpr const & s, int available, bool & found);
@ -169,6 +173,8 @@ public:
    bool is_nil_fmt() const { return kind() == format_kind::NIL; }
    unsigned hash() const { return m_value.hash(); }

+    format separate_tokens(std::function<bool(sexpr const &, sexpr const &)> sep) const; // NOLINT
+
    friend format compose(format const & f1, format const & f2);
    friend format nest(int i, format const & f);
    friend format highlight(format const & f, format::format_color const c);
--- a/src/util/sexpr/sexpr_fn.h
+++ b/src/util/sexpr/sexpr_fn.h
@ -32,7 +32,8 @@ sexpr map(sexpr const & l, F f) {
        return l;
    } else {
        lean_assert(is_cons(l));
-        return sexpr(f(head(l)), map(tail(l), f));
+        auto x = f(head(l)); // force left-to-right evaluation order
+        return sexpr(x, map(tail(l), f));
    }
 }

--- a/src/util/utf8.cpp
+++ b/src/util/utf8.cpp
@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE.
 Author: Leonardo de Moura
 */
 #include <cstdlib>
+#include "util/debug.h"

 namespace lean {
 bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; }
@ -37,4 +38,15 @@ size_t utf8_strlen(char const * str) {
    }
    return r;
 }
+
+char const * get_utf8_last_char(char const * str) {
+    char const * r;
+    lean_assert(*str != 0);
+    do {
+        r = str;
+        unsigned sz = get_utf8_size(*str);
+        str += sz;
+    } while (*str != 0);
+    return r;
+}
 }
--- a/src/util/utf8.h
+++ b/src/util/utf8.h
@ -9,4 +9,5 @@ namespace lean {
 bool is_utf8_next(unsigned char c);
 unsigned get_utf8_size(unsigned char c);
 size_t utf8_strlen(char const * str);
+char const * get_utf8_last_char(char const * str);
 }