feat(frontends/lean): improve pretty printing space insertion heuristic
This commit is contained in:
parent
8fd8ff2773
commit
189a300b11
9 changed files with 140 additions and 60 deletions
|
@ -6,6 +6,8 @@ Author: Leonardo de Moura
|
|||
*/
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <util/utf8.h>
|
||||
#include "util/flet.h"
|
||||
#include "kernel/replace_fn.h"
|
||||
#include "kernel/free_vars.h"
|
||||
|
@ -36,6 +38,7 @@ Author: Leonardo de Moura
|
|||
#include "frontends/lean/builtin_exprs.h"
|
||||
#include "frontends/lean/parser_config.h"
|
||||
#include "frontends/lean/local_ref_info.h"
|
||||
#include "frontends/lean/scanner.h"
|
||||
|
||||
namespace lean {
|
||||
static format * g_ellipsis_n_fmt = nullptr;
|
||||
|
@ -996,20 +999,6 @@ auto pretty_fn::pp_notation_child(expr const & e, unsigned lbp, unsigned rbp) ->
|
|||
}
|
||||
}
|
||||
|
||||
/**
   \brief Heuristic used while pretty printing notation: decide whether a space is
   printed after token \c tk when it is followed by the first argument of a
   repeated-argument action.

   No space is added after "(", ")" and "[".
*/
static bool add_extra_space_first(name const & tk) {
    // TODO(Leo): this list is a hard-coded stop-gap; a more principled rule
    // should replace it in the future.
    if (tk != "(" && tk != ")")
        return tk != "[";
    return false;
}
|
||||
|
||||
/**
   \brief Heuristic used while pretty printing notation: decide whether an extra
   space is printed next to token \c tk.

   No space is added for ",", "(", ")" and "[".
*/
static bool add_extra_space(name const & tk) {
    // TODO(Leo): this list is a hard-coded stop-gap; a more principled rule
    // should replace it in the future.
    if (tk != "," && tk != "(")
        return tk != ")" && tk != "[";
    return false;
}
|
||||
|
||||
static bool is_atomic_notation(notation_entry const & entry) {
|
||||
if (!entry.is_nud())
|
||||
return false;
|
||||
|
@ -1034,21 +1023,18 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
unsigned i = ts.size();
|
||||
unsigned last_rbp = inf_bp()-1;
|
||||
unsigned token_lbp = 0;
|
||||
bool extra_space = false;
|
||||
bool last_is_skip = false;
|
||||
bool last = true;
|
||||
while (i > 0) {
|
||||
--i;
|
||||
format curr;
|
||||
notation::action const & a = ts[i].get_action();
|
||||
name const & tk = ts[i].get_token();
|
||||
format tk_fmt = format(tk);
|
||||
switch (a.kind()) {
|
||||
case notation::action_kind::Skip:
|
||||
curr = format(tk);
|
||||
if (last) {
|
||||
last_rbp = inf_bp();
|
||||
last_is_skip = true;
|
||||
}
|
||||
curr = tk_fmt;
|
||||
if (last)
|
||||
last_rbp = inf_bp();
|
||||
break;
|
||||
case notation::action_kind::Expr:
|
||||
if (args.empty() || !args.back()) {
|
||||
|
@ -1057,14 +1043,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
expr e = *args.back();
|
||||
args.pop_back();
|
||||
result e_r = pp_notation_child(e, token_lbp, a.rbp());
|
||||
format e_fmt = e_r.fmt();
|
||||
curr = format(tk);
|
||||
// we add space after the token only when
|
||||
// 1- add_extra_space(tk) is true AND
|
||||
// 2- tk is the first token in a nud notation
|
||||
if (add_extra_space(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces))
|
||||
curr = curr + space();
|
||||
curr = curr + e_fmt;
|
||||
curr = tk_fmt + e_r.fmt();
|
||||
if (last)
|
||||
last_rbp = a.rbp();
|
||||
break;
|
||||
|
@ -1109,8 +1088,6 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
unsigned curr_lbp = token_lbp;
|
||||
if (auto t = a.get_terminator()) {
|
||||
curr = format(*t);
|
||||
if (add_extra_space(*t) && m_extra_spaces)
|
||||
curr = space() + curr;
|
||||
curr_lbp = get_some_precedence(m_token_table, *t);
|
||||
}
|
||||
unsigned j = rec_args.size();
|
||||
|
@ -1120,15 +1097,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
--j;
|
||||
result arg_res = pp_notation_child(rec_args[j], curr_lbp, a.rbp());
|
||||
if (j == 0) {
|
||||
if (add_extra_space_first(tk) && (!entry.is_nud() || i != 0 || m_extra_spaces))
|
||||
curr = format(tk) + space() + arg_res.fmt() + curr;
|
||||
else
|
||||
curr = format(tk) + arg_res.fmt() + curr;
|
||||
curr = tk_fmt + arg_res.fmt() + curr;
|
||||
} else {
|
||||
curr = sep_fmt + space() + arg_res.fmt() + curr;
|
||||
curr = sep_fmt + arg_res.fmt() + curr;
|
||||
}
|
||||
if (j > 0 && add_extra_space(a.get_sep()))
|
||||
curr = space() + curr;
|
||||
curr_lbp = sep_lbp;
|
||||
}
|
||||
break;
|
||||
|
@ -1136,12 +1108,10 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
case notation::action_kind::Binder:
|
||||
if (locals.size() != 1)
|
||||
return optional<result>();
|
||||
curr = format(tk) + pp_binder(locals[0]);
|
||||
if (!last)
|
||||
curr = curr + space();
|
||||
curr = tk_fmt + pp_binder(locals[0]);
|
||||
break;
|
||||
case notation::action_kind::Binders:
|
||||
curr = format(tk) + pp_binders(locals);
|
||||
curr = tk_fmt + pp_binders(locals);
|
||||
break;
|
||||
case notation::action_kind::ScopedExpr:
|
||||
if (args.empty() || !args.back()) {
|
||||
|
@ -1177,8 +1147,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
if (locals.empty())
|
||||
return optional<result>();
|
||||
result e_r = pp_notation_child(e, token_lbp, a.rbp());
|
||||
format e_fmt = e_r.fmt();
|
||||
curr = format(tk) + space() + e_fmt;
|
||||
curr = tk_fmt + e_r.fmt();
|
||||
if (last)
|
||||
last_rbp = a.rbp();
|
||||
break;
|
||||
|
@ -1192,12 +1161,8 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
fmt = curr;
|
||||
last = false;
|
||||
} else {
|
||||
if (extra_space)
|
||||
curr = curr + space();
|
||||
fmt = curr + fmt;
|
||||
}
|
||||
if (m_extra_spaces || !last_is_skip)
|
||||
extra_space = add_extra_space(tk);
|
||||
}
|
||||
unsigned first_lbp = inf_bp();
|
||||
if (!entry.is_nud()) {
|
||||
|
@ -1208,10 +1173,7 @@ auto pretty_fn::pp_notation(notation_entry const & entry, buffer<optional<expr>>
|
|||
expr e = *args.back();
|
||||
args.pop_back();
|
||||
format e_fmt = pp_notation_child(e, token_lbp, 0).fmt();
|
||||
if (m_extra_spaces || !last_is_skip)
|
||||
fmt = e_fmt + space() + fmt;
|
||||
else
|
||||
fmt = e_fmt + fmt;
|
||||
fmt = e_fmt + fmt;
|
||||
}
|
||||
return optional<result>(result(first_lbp, last_rbp, fmt));
|
||||
}
|
||||
|
@ -1221,7 +1183,7 @@ auto pretty_fn::pp_notation(expr const & e) -> optional<result> {
|
|||
if (!m_notation || is_var(e))
|
||||
return optional<result>();
|
||||
for (notation_entry const & entry : get_notation_entries(m_env, head_index(e))) {
|
||||
if (entry.group() != notation_entry_group::Main)
|
||||
if (entry.group() == notation_entry_group::Reserve)
|
||||
continue;
|
||||
if (!m_unicode && !entry.is_safe_ascii())
|
||||
continue; // ignore this notation declaration since unicode support is not enabled
|
||||
|
@ -1338,12 +1300,50 @@ class pp_beta_reduce_fn : public replace_visitor {
|
|||
}
|
||||
};
|
||||
|
||||
/**
   \brief Convert \c s to a std::string.

   String sexprs are converted with \c to_string; any other sexpr is rendered
   through its stream output operator.
*/
std::string sexpr_to_string(sexpr const & s) {
    if (!is_string(s)) {
        std::stringstream out;
        out << s;
        return out.str();
    }
    return to_string(s);
}
|
||||
|
||||
// check whether a space must be inserted between the strings so that lexing them would
|
||||
// produce separate tokens
|
||||
bool pretty_fn::needs_space_sep(std::string const & s1, std::string const & s2) const {
|
||||
if (is_id_rest(get_utf8_last_char(s1.data()), s1.data() + s1.size()) && is_id_rest(s2.data(), s2.data() + s2.size()))
|
||||
return true; // would be lexed as a single identifier without space
|
||||
|
||||
|
||||
// check whether s1 + s2 has a longer prefix in the token table than s1
|
||||
token_table const * t = &m_token_table;
|
||||
for (char c : s1) {
|
||||
t = t->find(c);
|
||||
if (!t)
|
||||
return false; // s1 must be an identifier, and we know s2 does not start with is_id_rest
|
||||
}
|
||||
for (char c : s2) {
|
||||
t = t->find(c);
|
||||
if (!t)
|
||||
return false;
|
||||
if (t->value())
|
||||
return true;
|
||||
}
|
||||
return true; // the next identifier may expand s1 + s2 to a token
|
||||
}
|
||||
|
||||
/**
   \brief Pretty print \c e.

   The expression is purified (after applying \c pp_beta_reduce_fn when \c m_beta is set),
   printed with \c pp_child, and the resulting format is post-processed with
   \c format::separate_tokens so that adjacent texts which would otherwise be lexed
   as one token are separated by a space.
*/
format pretty_fn::operator()(expr const & e) {
    m_depth = 0; m_num_steps = 0;
    // Keep the result instead of returning early: the separation pass below
    // has to run on both paths.
    result r;
    if (m_beta)
        r = pp_child(purify(pp_beta_reduce_fn()(e)), 0);
    else
        r = pp_child(purify(e), 0);

    // insert spaces so that lexing the result round-trips
    std::function<bool(sexpr const &, sexpr const &)> sep; // NOLINT
    sep = [&](sexpr const & s1, sexpr const & s2) {
        return needs_space_sep(sexpr_to_string(s1), sexpr_to_string(s2));
    };
    return r.fmt().separate_tokens(sep);
}
|
||||
|
||||
formatter_factory mk_pretty_formatter_factory() {
|
||||
|
|
|
@ -93,6 +93,7 @@ private:
|
|||
optional<result> pp_notation(expr const & e);
|
||||
|
||||
result add_paren_if_needed(result const & r, unsigned bp);
|
||||
bool needs_space_sep(std::string const &s1, std::string const &s2) const;
|
||||
|
||||
result pp_overriden_local_ref(expr const & e);
|
||||
bool ignore_local_ref(expr const & e);
|
||||
|
|
|
@ -314,10 +314,10 @@ static bool is_id_first(buffer<char> const & cs, unsigned i) {
|
|||
return is_letter_like_unicode(u);
|
||||
}
|
||||
|
||||
static bool is_id_rest(buffer<char> const & cs, unsigned i) {
|
||||
if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'')
|
||||
bool is_id_rest(char const * begin, char const * end) {
|
||||
if (std::isalnum(*begin) || *begin == '_' || *begin == '\'')
|
||||
return true;
|
||||
unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
|
||||
unsigned u = utf8_to_unicode(begin, end);
|
||||
return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u);
|
||||
}
|
||||
|
||||
|
@ -337,7 +337,7 @@ auto scanner::read_key_cmd_id() -> token_kind {
|
|||
unsigned i = id_sz;
|
||||
next_utf(cs);
|
||||
num_utfs++;
|
||||
if (is_id_rest(cs, i)) {
|
||||
if (is_id_rest(&cs[i], cs.end())) {
|
||||
} else if (cs[i] == '.') {
|
||||
next_utf(cs);
|
||||
num_utfs++;
|
||||
|
|
|
@ -92,6 +92,7 @@ public:
|
|||
};
|
||||
};
|
||||
std::ostream & operator<<(std::ostream & out, scanner::token_kind k);
|
||||
bool is_id_rest(char const * begin, char const * end);
|
||||
void initialize_scanner();
|
||||
void finalize_scanner();
|
||||
}
|
||||
|
|
|
@ -177,6 +177,64 @@ format wrap(format const & f1, format const & f2) {
|
|||
return f1 + choice(format(" "), line()) + f2;
|
||||
}
|
||||
|
||||
/**
   \brief Rewrite the format sexpr \c s, putting a space in front of every text node
   \c t for which <tt>sep(*last, t)</tt> holds, where \c last is the text visited
   just before \c t.

   Nodes that carry no text (NIL, LINE, color markers) are returned unchanged and leave
   \c last untouched.

   \param s    format sexpr to rewrite
   \param last text of the most recently visited text node, or \c nullptr if there is none yet
   \param sep  predicate deciding whether two adjacent texts must be separated
   \return the rewritten sexpr paired with the updated \c last pointer
*/
std::tuple<sexpr, sexpr const *> format::separate_tokens(sexpr const & s, sexpr const * last,
                                                         std::function<bool(sexpr const &, sexpr const &)> sep // NOLINT
) const {
    switch (sexpr_kind(s)) {
    case format_kind::NIL:
    case format_kind::LINE:
    case format_kind::COLOR_BEGIN:
    case format_kind::COLOR_END:
        return std::make_tuple(s, last);
    case format_kind::COMPOSE:
    case format_kind::FLAT_COMPOSE:
    {
        // `last` is threaded through the children, so this depends on `map`
        // applying the callback to the list elements from left to right.
        sexpr list = sexpr_compose_list(s);
        list = map(list, [&](sexpr const & child) {
            sexpr t;
            std::tie(t, last) = separate_tokens(child, last, sep);
            return t;
        });
        // Rebuild with the kind of the node being visited. Testing the root
        // (m_value) here would give nested nodes the root's compose kind.
        sexpr t = sexpr_kind(s) == format_kind::COMPOSE ? sexpr_compose(list) : sexpr_flat_compose(list);
        return std::make_tuple(t, last);
    }
    case format_kind::NEST:
    {
        sexpr t;
        std::tie(t, last) = separate_tokens(sexpr_nest_s(s), last, sep);
        return std::make_tuple(sexpr_nest(sexpr_nest_i(s), t), last);
    }
    case format_kind::TEXT:
    {
        // NOTE(review): the returned pointer refers to `text`; this presumes
        // sexpr_text_t hands back a reference into `s` rather than a temporary - confirm.
        sexpr const & text = sexpr_text_t(s);
        if (last && sep(*last, text))
            return std::make_tuple(sexpr_compose({*g_sexpr_space, s}), &text);
        else
            return std::make_tuple(s, &text);
    }
    case format_kind::CHOICE:
    {
        // we assume that choices only differ in spacing and thus share their last token
        sexpr s1, s2; sexpr const * last1, * last2;
        std::tie(s1, last1) = separate_tokens(sexpr_choice_1(s), last, sep);
        std::tie(s2, last2) = separate_tokens(sexpr_choice_2(s), last, sep);
        lean_assert(last1 == last2 || (last1 && last2 && *last1 == *last2));
        return std::make_tuple(sexpr_choice(s1, s2), last1);
    }
    }
    lean_unreachable(); // LCOV_EXCL_LINE
}
|
||||
|
||||
/**
|
||||
\brief Replaces every text sexpr \c t with <tt>compose(" ", t)</tt> if there is a preceding
|
||||
text sexpr \c s and <tt>sep(s, t)</tt> is true
|
||||
*/
|
||||
format format::separate_tokens(
    std::function<bool(sexpr const &, sexpr const &)> sep // NOLINT
) const {
    // Start the traversal at the root with no preceding text; only the rewritten
    // sexpr is of interest, the trailing "last text" pointer is dropped.
    std::tuple<sexpr, sexpr const *> const separated = separate_tokens(m_value, nullptr, sep);
    return format(std::get<0>(separated));
}
|
||||
|
||||
/**
|
||||
\brief Auxiliary exception used to sign that the amount of
|
||||
available space was exhausted. It is used in \c space_upto_line_break and
|
||||
|
|
|
@ -120,6 +120,10 @@ private:
|
|||
return sexpr(sexpr(format::format_kind::LINE), sexpr());
|
||||
}
|
||||
|
||||
std::tuple<sexpr, sexpr const *> separate_tokens(sexpr const & s, sexpr const * last,
|
||||
std::function<bool(sexpr const &, sexpr const &)> sep //NOLINT
|
||||
) const;
|
||||
|
||||
// Functions used inside of pretty printing
|
||||
static bool space_upto_line_break_list_exceeded(sexpr const & s, int available, std::vector<pair<sexpr, unsigned>> const & todo);
|
||||
static int space_upto_line_break(sexpr const & s, int available, bool & found);
|
||||
|
@ -169,6 +173,8 @@ public:
|
|||
bool is_nil_fmt() const { return kind() == format_kind::NIL; }
|
||||
unsigned hash() const { return m_value.hash(); }
|
||||
|
||||
format separate_tokens(std::function<bool(sexpr const &, sexpr const &)> sep) const; // NOLINT
|
||||
|
||||
friend format compose(format const & f1, format const & f2);
|
||||
friend format nest(int i, format const & f);
|
||||
friend format highlight(format const & f, format::format_color const c);
|
||||
|
|
|
@ -32,7 +32,8 @@ sexpr map(sexpr const & l, F f) {
|
|||
return l;
|
||||
} else {
|
||||
lean_assert(is_cons(l));
|
||||
return sexpr(f(head(l)), map(tail(l), f));
|
||||
auto x = f(head(l)); // force left-to-right evaluation order
|
||||
return sexpr(x, map(tail(l), f));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE.
|
|||
Author: Leonardo de Moura
|
||||
*/
|
||||
#include <cstdlib>
|
||||
#include "util/debug.h"
|
||||
|
||||
namespace lean {
|
||||
bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; }
|
||||
|
@ -37,4 +38,15 @@ size_t utf8_strlen(char const * str) {
|
|||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
char const * get_utf8_last_char(char const * str) {
|
||||
char const * r;
|
||||
lean_assert(*str != 0);
|
||||
do {
|
||||
r = str;
|
||||
unsigned sz = get_utf8_size(*str);
|
||||
str += sz;
|
||||
} while (*str != 0);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,5 @@ namespace lean {
|
|||
bool is_utf8_next(unsigned char c);
|
||||
unsigned get_utf8_size(unsigned char c);
|
||||
size_t utf8_strlen(char const * str);
|
||||
char const * get_utf8_last_char(char const * str);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue