From 189a300b118bda08a40efb0750ab8adc4c57a164 Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Mon, 28 Sep 2015 01:17:46 +0200 Subject: [PATCH] feat(frontends/lean): improve pretty printing space insertion heuristic --- src/frontends/lean/pp.cpp | 110 ++++++++++++++++----------------- src/frontends/lean/pp.h | 1 + src/frontends/lean/scanner.cpp | 8 +-- src/frontends/lean/scanner.h | 1 + src/util/sexpr/format.cpp | 58 +++++++++++++++++ src/util/sexpr/format.h | 6 ++ src/util/sexpr/sexpr_fn.h | 3 +- src/util/utf8.cpp | 12 ++++ src/util/utf8.h | 1 + 9 files changed, 140 insertions(+), 60 deletions(-) diff --git a/src/frontends/lean/pp.cpp b/src/frontends/lean/pp.cpp index 77aaf2869..38705b72f 100644 --- a/src/frontends/lean/pp.cpp +++ b/src/frontends/lean/pp.cpp @@ -6,6 +6,8 @@ Author: Leonardo de Moura */ #include #include +#include +#include #include "util/flet.h" #include "kernel/replace_fn.h" #include "kernel/free_vars.h" @@ -36,6 +38,7 @@ Author: Leonardo de Moura #include "frontends/lean/builtin_exprs.h" #include "frontends/lean/parser_config.h" #include "frontends/lean/local_ref_info.h" +#include "frontends/lean/scanner.h" namespace lean { static format * g_ellipsis_n_fmt = nullptr; @@ -996,20 +999,6 @@ auto pretty_fn::pp_notation_child(expr const & e, unsigned lbp, unsigned rbp) -> } } -static bool add_extra_space_first(name const & tk) { - // TODO(Leo): this is a hard-coded temporary solution for deciding whether extra - // spaces should be added or not when pretty printing notation. - // We should implement a better solution in the future. - return tk != "(" && tk != ")" && tk != "["; -} - -static bool add_extra_space(name const & tk) { - // TODO(Leo): this is a hard-coded temporary solution for deciding whether extra - // spaces should be added or not when pretty printing notation. - // We should implement a better solution in the future. - return tk != "," && tk != "(" && tk != ")" && tk != "["; -} - static bool is_atomic_notation(notation_entry const & entry) { if (!entry.is_nud()) return false; @@ -1034,21 +1023,18 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> unsigned i = ts.size(); unsigned last_rbp = inf_bp()-1; unsigned token_lbp = 0; - bool extra_space = false; - bool last_is_skip = false; bool last = true; while (i > 0) { --i; format curr; notation::action const & a = ts[i].get_action(); name const & tk = ts[i].get_token(); + format tk_fmt = format(tk); switch (a.kind()) { case notation::action_kind::Skip: - curr = format(tk); - if (last) { - last_rbp = inf_bp(); - last_is_skip = true; - } + curr = tk_fmt; + if (last) + last_rbp = inf_bp(); break; case notation::action_kind::Expr: if (args.empty() || !args.back()) { @@ -1057,14 +1043,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> expr e = *args.back(); args.pop_back(); result e_r = pp_notation_child(e, token_lbp, a.rbp()); - format e_fmt = e_r.fmt(); - curr = format(tk); - // we add space after the token only when - // 1- add_extra_space(tk) is true AND - // 2- tk is the first token in a nud notation - if (add_extra_space(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces)) - curr = curr + space(); - curr = curr + e_fmt; + curr = tk_fmt + e_r.fmt(); if (last) last_rbp = a.rbp(); break; @@ -1109,8 +1088,6 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> unsigned curr_lbp = token_lbp; if (auto t = a.get_terminator()) { curr = format(*t); - if (add_extra_space(*t) && m_extra_spaces) - curr = space() + curr; curr_lbp = get_some_precedence(m_token_table, *t); } unsigned j = rec_args.size(); @@ -1120,15 +1097,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> --j; result arg_res = pp_notation_child(rec_args[j], curr_lbp, a.rbp()); if (j == 0) { - if (add_extra_space_first(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces)) - curr = format(tk) + space() + arg_res.fmt() + curr; - else - curr = format(tk) + arg_res.fmt() + curr; + curr = tk_fmt + arg_res.fmt() + curr; } else { - curr = sep_fmt + space() + arg_res.fmt() + curr; + curr = sep_fmt + arg_res.fmt() + curr; } - if (j > 0 && add_extra_space(a.get_sep())) - curr = space() + curr; curr_lbp = sep_lbp; } break; @@ -1136,12 +1108,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> case notation::action_kind::Binder: if (locals.size() != 1) return optional(); - curr = format(tk) + pp_binder(locals[0]); - if (!last) - curr = curr + space(); + curr = tk_fmt + pp_binder(locals[0]); break; case notation::action_kind::Binders: - curr = format(tk) + pp_binders(locals); + curr = tk_fmt + pp_binders(locals); break; case notation::action_kind::ScopedExpr: if (args.empty() || !args.back()) { @@ -1177,8 +1147,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> if (locals.empty()) return optional(); result e_r = pp_notation_child(e, token_lbp, a.rbp()); - format e_fmt = e_r.fmt(); - curr = format(tk) + space() + e_fmt; + curr = tk_fmt + e_r.fmt(); if (last) last_rbp = a.rbp(); break; @@ -1192,12 +1161,8 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> fmt = curr; last = false; } else { - if (extra_space) - curr = curr + space(); fmt = curr + fmt; } - if (m_extra_spaces || !last_is_skip) - extra_space = add_extra_space(tk); } unsigned first_lbp = inf_bp(); if (!entry.is_nud()) { @@ -1208,10 +1173,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer> expr e = *args.back(); args.pop_back(); format e_fmt = pp_notation_child(e, token_lbp, 0).fmt(); - if (m_extra_spaces || !last_is_skip) - fmt = e_fmt + space() + fmt; - else - fmt = e_fmt + fmt; + fmt = e_fmt + fmt; } return optional(result(first_lbp, last_rbp, fmt)); } @@ -1221,7 +1183,7 @@ auto pretty_fn::pp_notation(expr const & e) -> optional { if (!m_notation || is_var(e)) return optional(); for (notation_entry const & entry : get_notation_entries(m_env, head_index(e))) { - if (entry.group() != notation_entry_group::Main) + if (entry.group() == notation_entry_group::Reserve) continue; if (!m_unicode && !entry.is_safe_ascii()) continue; // ignore this notation declaration since unicode support is not enabled @@ -1338,12 +1300,50 @@ class pp_beta_reduce_fn : public replace_visitor { } }; +std::string sexpr_to_string(sexpr const & s) { + if (is_string(s)) + return to_string(s); + std::stringstream ss; + ss << s; + return ss.str(); +} + +// check whether a space must be inserted between the strings so that lexing them would +// produce separate tokens +bool pretty_fn::needs_space_sep(std::string const & s1, std::string const & s2) const { + if (is_id_rest(get_utf8_last_char(s1.data()), s1.data() + s1.size()) && is_id_rest(s2.data(), s2.data() + s2.size())) + return true; // would be lexed as a single identifier without space + + + // check whether s1 + s2 has a longer prefix in the token table than s1 + token_table const * t = &m_token_table; + for (char c : s1) { + t = t->find(c); + if (!t) + return false; // s1 must be an identifier, and we know s2 does not start with is_id_rest + } + for (char c : s2) { + t = t->find(c); + if (!t) + return false; + if (t->value()) + return true; + } + return true; // the next identifier may expand s1 + s2 to a token +} + format pretty_fn::operator()(expr const & e) { m_depth = 0; m_num_steps = 0; + result r; if (m_beta) - return pp_child(purify(pp_beta_reduce_fn()(e)), 0).fmt(); + r = pp_child(purify(pp_beta_reduce_fn()(e)), 0); else - return pp_child(purify(e), 0).fmt(); + r = pp_child(purify(e), 0); + + // insert spaces so that lexing the result round-trips + std::function sep; // NOLINT + sep = [&](sexpr const & s1, sexpr const & s2) { return needs_space_sep(sexpr_to_string(s1), sexpr_to_string(s2)); }; + return r.fmt().separate_tokens(sep); } formatter_factory mk_pretty_formatter_factory() { diff --git a/src/frontends/lean/pp.h b/src/frontends/lean/pp.h index d87fbf0e1..eb35cf89a 100644 --- a/src/frontends/lean/pp.h +++ b/src/frontends/lean/pp.h @@ -93,6 +93,7 @@ private: optional pp_notation(expr const & e); result add_paren_if_needed(result const & r, unsigned bp); + bool needs_space_sep(std::string const &s1, std::string const &s2) const; result pp_overriden_local_ref(expr const & e); bool ignore_local_ref(expr const & e); diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp index f6d6f78fe..87ebd3903 100644 --- a/src/frontends/lean/scanner.cpp +++ b/src/frontends/lean/scanner.cpp @@ -314,10 +314,10 @@ static bool is_id_first(buffer const & cs, unsigned i) { return is_letter_like_unicode(u); } -static bool is_id_rest(buffer const & cs, unsigned i) { - if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'') +bool is_id_rest(char const * begin, char const * end) { + if (std::isalnum(*begin) || *begin == '_' || *begin == '\'') return true; - unsigned u = utf8_to_unicode(cs.begin() + i, cs.end()); + unsigned u = utf8_to_unicode(begin, end); return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u); } @@ -337,7 +337,7 @@ auto scanner::read_key_cmd_id() -> token_kind { unsigned i = id_sz; next_utf(cs); num_utfs++; - if (is_id_rest(cs, i)) { + if (is_id_rest(&cs[i], cs.end())) { } else if (cs[i] == '.') { next_utf(cs); num_utfs++; diff --git a/src/frontends/lean/scanner.h b/src/frontends/lean/scanner.h index 641a4e281..cdc20267b 100644 --- a/src/frontends/lean/scanner.h +++ b/src/frontends/lean/scanner.h @@ -92,6 +92,7 @@ public: }; }; std::ostream & operator<<(std::ostream & out, scanner::token_kind k); +bool is_id_rest(char const * begin, char const * end); void initialize_scanner(); void finalize_scanner(); } diff --git a/src/util/sexpr/format.cpp b/src/util/sexpr/format.cpp index 95c948aca..200d3cf25 100644 --- a/src/util/sexpr/format.cpp +++ b/src/util/sexpr/format.cpp @@ -177,6 +177,64 @@ format wrap(format const & f1, format const & f2) { return f1 + choice(format(" "), line()) + f2; } +std::tuple format::separate_tokens(sexpr const & s, sexpr const * last, + std::function sep // NOLINT +) const { + switch (sexpr_kind(s)) { + case format_kind::NIL: + case format_kind::LINE: + case format_kind::COLOR_BEGIN: + case format_kind::COLOR_END: + return std::make_tuple(s, last); + case format_kind::COMPOSE: + case format_kind::FLAT_COMPOSE: + { + sexpr list = sexpr_compose_list(s); + list = map(list, [&](sexpr const & s) { + sexpr t; + std::tie(t, last) = separate_tokens(s, last, sep); + return t; + }); + sexpr t = sexpr_kind(m_value) == format_kind::COMPOSE ? sexpr_compose(list) : sexpr_flat_compose(list); + return std::make_tuple(t, last); + } + case format_kind::NEST: + { + sexpr t; + std::tie(t, last) = separate_tokens(sexpr_nest_s(s), last, sep); + return std::make_tuple(sexpr_nest(sexpr_nest_i(s), t), last); + } + case format_kind::TEXT: + { + sexpr const & text = sexpr_text_t(s); + if (last && sep(*last, text)) + return std::make_tuple(sexpr_compose({*g_sexpr_space, s}), &text); + else + return std::make_tuple(s, &text); + } + case format_kind::CHOICE: + { + // we assume that choices only differ in spacing and thus share their last token + sexpr s1, s2; sexpr const * last1, * last2; + std::tie(s1, last1) = separate_tokens(sexpr_choice_1(s), last, sep); + std::tie(s2, last2) = separate_tokens(sexpr_choice_2(s), last, sep); + lean_assert(last1 == last2 || (last1 && last2 && *last1 == *last2)); + return std::make_tuple(sexpr_choice(s1, s2), last1); + } + } + lean_unreachable(); // LCOV_EXCL_LINE +} + +/** + \brief Replaces every text sepxr \c t with compose(" ", t) if there is a preceding + text sexpr \c s and sep(s, t) is true +*/ +format format::separate_tokens( + std::function sep // NOLINT +) const { + return format(std::get<0>(separate_tokens(m_value, nullptr, sep))); +} + /** \brief Auxiliary exception used to sign that the amount of available space was exhausted. It is used in \c space_upto_line_break and diff --git a/src/util/sexpr/format.h b/src/util/sexpr/format.h index fb86fce40..fffc0587f 100644 --- a/src/util/sexpr/format.h +++ b/src/util/sexpr/format.h @@ -120,6 +120,10 @@ private: return sexpr(sexpr(format::format_kind::LINE), sexpr()); } + std::tuple separate_tokens(sexpr const & s, sexpr const * last, + std::function sep //NOLINT + ) const; + // Functions used inside of pretty printing static bool space_upto_line_break_list_exceeded(sexpr const & s, int available, std::vector> const & todo); static int space_upto_line_break(sexpr const & s, int available, bool & found); @@ -169,6 +173,8 @@ public: bool is_nil_fmt() const { return kind() == format_kind::NIL; } unsigned hash() const { return m_value.hash(); } + format separate_tokens(std::function sep) const; // NOLINT + friend format compose(format const & f1, format const & f2); friend format nest(int i, format const & f); friend format highlight(format const & f, format::format_color const c); diff --git a/src/util/sexpr/sexpr_fn.h b/src/util/sexpr/sexpr_fn.h index 15e6de8bf..9460a11a1 100644 --- a/src/util/sexpr/sexpr_fn.h +++ b/src/util/sexpr/sexpr_fn.h @@ -32,7 +32,8 @@ sexpr map(sexpr const & l, F f) { return l; } else { lean_assert(is_cons(l)); - return sexpr(f(head(l)), map(tail(l), f)); + auto x = f(head(l)); // force left-to-right evaluation order + return sexpr(x, map(tail(l), f)); } } diff --git a/src/util/utf8.cpp b/src/util/utf8.cpp index 11dd3e66b..5cbd8d9af 100644 --- a/src/util/utf8.cpp +++ b/src/util/utf8.cpp @@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE. Author: Leonardo de Moura */ #include +#include "util/debug.h" namespace lean { bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; } @@ -37,4 +38,15 @@ size_t utf8_strlen(char const * str) { } return r; } + +char const * get_utf8_last_char(char const * str) { + char const * r; + lean_assert(*str != 0); + do { + r = str; + unsigned sz = get_utf8_size(*str); + str += sz; + } while (*str != 0); + return r; +} } diff --git a/src/util/utf8.h b/src/util/utf8.h index 3641952cf..75b2d4854 100644 --- a/src/util/utf8.h +++ b/src/util/utf8.h @@ -9,4 +9,5 @@ namespace lean { bool is_utf8_next(unsigned char c); unsigned get_utf8_size(unsigned char c); size_t utf8_strlen(char const * str); +char const * get_utf8_last_char(char const * str); }