"""PrettyConllu writer with aligned columns for plain-text, HTML and TeX/PDF.
The writer prints the 10 CoNLL-U columns (or their configurable subset) in
aligned columns. Column widths are configured by ``w_*`` parameters, but for
each sentence the effective width is shrunk to the longest value needed in
that sentence.
The main parameter is ``format`` which can be set to ``plain``, ``html`` or ``tex``.
``plain`` is the default and produces plain text.
``html`` produces HTML output with tooltips.
``tex`` produces TeX/PDF output (with ``tex_style=standalone``, one sentence per page using \\documentclass{standalone}).
The ``color`` parameter controls colorization in the output.
The default value is ``auto`` which means that for ``plain`` format,
colors are enabled only when writing to a TTY, while for ``html`` and ``tex`` formats,
colors are always enabled. Colors can be forced on or off with values ``1`` or ``0``.
Example CLI usage::
# Plain text (default), compact per-sentence widths, no column names.
udapy write.PrettyConllu < file.conllu
# Plain text with custom widths and visible column names.
udapy write.PrettyConllu print_column_names=1 w_form=20 w_feats=32 < file.conllu
# Force color even if not writing to a TTY
udapy write.PrettyConllu color=1 < file.conllu | less -R
# The same as above, using a udapy syntactic sugar
udapy -P < file.conllu | less -R
# HTML output with tooltips.
udapy write.PrettyConllu format=html < file.conllu > pretty.html
# TeX/PDF output, one sentence per page (using \\documentclass{standalone}).
udapy write.PrettyConllu format=tex tex_style=standalone < file.conllu > pretty.tex
pdflatex pretty.tex
"""
import os
import re
import sys
import textwrap
from html import escape
import colorama
from termcolor import colored
from udapi.block.write.conllu import Conllu
# The ten CoNLL-U columns in their canonical order. The colour tables below
# list one value per column, in exactly this order.
ATTRS_ALL = (
    'ord', 'form', 'lemma', 'upos', 'xpos',
    'feats', 'head', 'deprel', 'deps', 'misc',
)

# Colour name passed to termcolor for each column (coloured plain output).
ANSI_COLOR_OF = dict(zip(ATTRS_ALL, (
    'green', 'yellow', 'cyan', 'red', 'white',
    'magenta', 'green', 'blue', 'cyan', 'magenta',
)))

# CSS colour emitted into the HTML stylesheet for each column.
HTML_COLOR_OF = dict(zip(ATTRS_ALL, (
    '#226622', '#7a3e00', '#5a2b8f', '#b00020', '#4c4c4c',
    '#7a1ea1', '#116611', '#0a4fb5', '#0a7b7b', '#b24800',
)))

# Colour name per column for TeX; the same names are declared with
# \definecolor in the TeX header.
TEX_COLOR_OF = dict(zip(ATTRS_ALL, (
    'PrettyOrd', 'PrettyForm', 'PrettyLemma', 'PrettyUpos', 'PrettyXpos',
    'PrettyFeats', 'PrettyHead', 'PrettyDeprel', 'PrettyDeps', 'PrettyMisc',
)))
[docs]
class PrettyConllu(Conllu):
"""A writer of aligned CoNLL-U tables in plain, TeX and HTML formats."""
def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True,
format='plain', attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc',
w_ord=4, w_form=16, w_lemma=16, w_upos=8, w_xpos=10,
w_feats=28, w_head=6, w_deprel=16, w_deps=20, w_misc=28,
color='auto',
print_column_names=False,
tex_style='resize',
tooltip=True, tooltip_feats_misc=True,
mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, **kwargs):
"""Create a new PrettyConllu writer.
Args:
print_sent_id: Print ``# sent_id = ...`` comments.
print_text: Print ``# text = ...`` comments.
print_empty_trees: Keep Conllu behavior for empty trees.
format: Output format: ``plain``, ``tex`` or ``html``.
attributes: Comma-separated list of displayed columns.
w_ord: Max width for ``ord``.
w_form: Max width for ``form``.
w_lemma: Max width for ``lemma``.
w_upos: Max width for ``upos``.
w_xpos: Max width for ``xpos``.
w_feats: Max width for ``feats``.
w_head: Max width for ``head``.
w_deprel: Max width for ``deprel``.
w_deps: Max width for ``deps``.
w_misc: Max width for ``misc``.
color: Color mode ``auto``, ``1`` or ``0``.
For ``plain``, ``auto`` enables colors only when writing to a TTY.
For ``html`` and ``tex``, ``auto`` is interpreted as ``1``.
print_column_names: Print column header row.
tex_style: TeX rendering style:
``resize`` (default) shrinks too-wide tables to ``\textwidth``,
``standalone`` uses ``standalone`` class and puts each sentence
on a new cropped page,
``overflow`` keeps natural width even if it overflows page.
tooltip: Enable tooltip generation.
tooltip_feats_misc: Show multiline tooltip for FEATS/MISC values with ``|``.
mark: Regex for marked rows (same semantics as TextModeTrees).
marked_only: Print only trees containing one or more marked nodes/comments.
"""
super().__init__(print_sent_id=print_sent_id, print_text=print_text,
print_empty_trees=print_empty_trees, **kwargs)
self.format = format
self.attrs = [a.strip() for a in attributes.split(',') if a.strip()]
unknown = [a for a in self.attrs if a not in ATTRS_ALL]
if unknown:
raise ValueError('Unknown attributes in PrettyConllu: %s' % ', '.join(unknown))
self.width_of = {
'ord': int(w_ord),
'form': int(w_form),
'lemma': int(w_lemma),
'upos': int(w_upos),
'xpos': int(w_xpos),
'feats': int(w_feats),
'head': int(w_head),
'deprel': int(w_deprel),
'deps': int(w_deps),
'misc': int(w_misc),
}
self.tooltip = tooltip
self.tooltip_feats_misc = tooltip_feats_misc
self.color = color
self._color_enabled = False
self.print_column_names = print_column_names
self.tex_style = tex_style
self.marked_only = marked_only
self.mark_re = re.compile(mark + '=') if mark else None
self.comment_mark_re = re.compile(r'^\s*%s\s*=' % mark, re.M) if mark else None
self._tex_sentence_count = 0
def _collect_comment_lines(self, tree):
"""Collect comment lines (without leading #) for the current tree."""
return list(self.iter_comment_lines(tree))
[docs]
def should_print_tree(self, tree, nodes):
"""Should this tree be printed?"""
if not self.marked_only:
return True
if any(self.is_marked(node) for node in nodes):
return True
if self.comment_mark_re is None:
return False
comment_lines = self._collect_comment_lines(tree)
if not comment_lines:
return False
comment_text = '\n'.join(comment_lines)
return self.comment_mark_re.search(comment_text)
def _print_comments_plain(self, tree):
for line in self._collect_comment_lines(tree):
print('#' + line)
[docs]
def before_process_document(self, document):
    """Resolve the color mode and emit the format-specific preamble.

    ``text`` is accepted as an alias of ``plain``. The ``color`` setting is
    turned into ``self._color_enabled``, then the HTML or TeX header is
    printed where applicable.

    Raises:
        ValueError: For an unsupported ``color``, ``tex_style`` or
            ``format`` value (checked in this order).
    """
    super().before_process_document(document)
    if self.format == 'text':
        self.format = 'plain'

    # Normalise str / bool / anything else (e.g. int) to a lowercase token.
    requested = self.color
    if isinstance(requested, str):
        mode = requested.lower()
    elif isinstance(requested, bool):
        mode = '1' if requested else '0'
    else:
        mode = str(requested)

    if mode == 'auto':
        # Only plain output depends on the terminal; html/tex default to on.
        if self.format == 'plain':
            self._color_enabled = sys.stdout.isatty()
        else:
            self._color_enabled = True
    elif mode in ('1', 'true', 'yes', 'on'):
        self._color_enabled = True
    elif mode in ('0', 'false', 'no', 'off'):
        self._color_enabled = False
    else:
        raise ValueError("color must be one of: auto, 1, 0")

    output_format = self.format
    if output_format == 'plain':
        if self._color_enabled:
            colorama.just_fix_windows_console()
            # Set so that colors are emitted even when stdout is not a TTY.
            os.environ['FORCE_COLOR'] = '1'
    elif output_format == 'html':
        self._print_html_header()
    elif output_format == 'tex':
        if self.tex_style not in ('resize', 'standalone', 'overflow'):
            raise ValueError("tex_style must be one of: resize, standalone, overflow")
        self._tex_sentence_count = 0
        self._print_tex_header()
    else:
        raise ValueError("format must be one of: plain, tex, html")
[docs]
def after_process_document(self, document):
    """Print the closing markup for html/tex, then delegate to the parent."""
    if self.format == 'html':
        closing_lines = (
            '<pre class="sentence-gap">',
            '',
            '</pre>',
            '</body>',
            '</html>',
        )
        for closing_line in closing_lines:
            print(closing_line)
    elif self.format == 'tex':
        print('\\end{document}')
    super().after_process_document(document)
[docs]
def process_tree(self, tree):
"""Render one tree in the selected output format."""
nodes = tree.descendants_and_empty
if not nodes and not self.print_empty_trees:
return
if not self.should_print_tree(tree, nodes):
return
in_standalone_tex = self.format == 'tex' and self.tex_style == 'standalone'
if in_standalone_tex:
print('\\begin{mypage}')
tex_comment_lines = []
if self.format == 'plain':
self._print_comments_plain(tree)
elif self.format == 'html':
self._print_comments_html(tree)
else:
tex_comment_lines = self._collect_comment_lines(tree)
rows = self._build_rows(tree, nodes)
widths = self._effective_widths(rows)
if self.format == 'plain':
if self._color_enabled:
self._render_ansi(rows, widths)
else:
self._render_plain(rows, widths)
print('')
elif self.format == 'html':
self._render_html(rows, widths)
else:
self._render_tex(rows, widths, tex_comment_lines)
if in_standalone_tex:
print('\\end{mypage}')
def _effective_widths(self, rows):
"""Compute per-sentence effective widths from configured maxima."""
widths = {}
for attr in self.attrs:
max_cfg = self.width_of[attr]
if max_cfg <= 0:
widths[attr] = 0
continue
max_len = 0
for row in rows:
text = '_' if row.get(attr) is None else str(row.get(attr))
shown_len = min(len(text), max_cfg)
if shown_len > max_len:
max_len = shown_len
if self.print_column_names:
max_len = max(max_len, min(len(attr.upper()), max_cfg))
widths[attr] = min(max_cfg, max_len)
return widths
def _build_rows(self, tree, nodes):
rows = []
last_mwt_id = 0
for node in nodes:
mwt = node._mwt
if mwt and node._ord > last_mwt_id:
rows.append({
'ord': mwt.ord_range,
'form': '_' if mwt.form is None else mwt.form,
'lemma': '_',
'upos': '_',
'xpos': '_',
'feats': '_' if mwt._feats is None else str(mwt.feats),
'head': '_',
'deprel': '_',
'deps': '_',
'misc': '_' if mwt._misc is None else str(mwt.misc),
'_is_marked': False,
'_row_type': 'mwt',
})
last_mwt_id = mwt.words[-1]._ord
if node._parent is None:
head = '_' # Empty nodes
else:
try:
head = str(node._parent._ord)
except AttributeError:
head = '0'
rows.append({
'ord': str(node._ord),
'form': node.form,
'lemma': node.lemma,
'upos': node.upos,
'xpos': node.xpos,
'feats': '_' if node._feats is None else str(node.feats),
'head': head,
'deprel': node.deprel,
'deps': node.raw_deps,
'misc': '_' if node._misc is None else str(node.misc),
'_is_marked': self.is_marked(node),
'_row_type': 'empty' if node._parent is None else 'token',
})
if not tree._descendants:
rows.append({
'ord': '1',
'form': '_',
'lemma': '_',
'upos': '_',
'xpos': '_',
'feats': '_',
'head': '0',
'deprel': '_',
'deps': '_',
'misc': 'Empty=Yes',
'_is_marked': False,
'_row_type': 'artificial',
})
return rows
def _render_plain(self, rows, widths):
padded = {attr: w + 1 for attr, w in widths.items()}
if self.print_column_names:
print(self._header_line(padded))
print(self._separator_line(padded))
for row in rows:
print(self._row_line(row, padded))
def _render_ansi(self, rows, widths):
padded = {attr: w + 1 for attr, w in widths.items()}
if self.print_column_names:
print(self._header_line(padded, colorize=True))
print(self._separator_line(padded))
for row in rows:
print(self._row_line(row, padded, colorize=True))
def _render_html(self, rows, widths):
print('<table class="prettyconllu">')
if self.print_column_names:
print(' <thead><tr>%s</tr></thead>' % ''.join(
'<th class="%s">%s</th>' % (attr, escape(attr.upper())) for attr in self.attrs))
print(' <tbody>')
for row in rows:
row_class = row['_row_type'] + (' marked' if row['_is_marked'] else '')
print(' <tr class="%s">' % row_class)
for attr in self.attrs:
full_text = '_' if row.get(attr) is None else str(row.get(attr))
display, _, tip = self._fit_value(attr, row.get(attr), widths[attr], pad=False)
value_html = escape(display)
title_attr = ''
if tip is not None:
tip_html = escape(tip).replace('\n', ' ')
title_attr = ' title="%s"' % tip_html
if display != full_text:
full_html = escape(full_text)
value_html = '<span class="copy-full"%s>%s</span><span class="display-short">%s</span>' % (
title_attr, full_html, value_html)
elif title_attr:
value_html = '<span%s>%s</span>' % (title_attr, value_html)
print(' <td class="%s">%s</td>' % (attr, value_html))
print(' </tr>')
print(' </tbody>')
print('</table>')
# Keep one copyable blank line between sentences in browser text copy.
print('<pre class="sentence-gap">')
print('')
print('</pre>')
def _render_tex(self, rows, widths, comment_lines=None):
self._tex_sentence_count += 1
spec = ''.join(['r' if attr == 'head' else 'l' for attr in self.attrs])
print('\\begingroup')
print('\\small')
print('\\noindent')
if self.tex_style == 'resize':
print('\\begin{adjustbox}{max width=\\textwidth}')
print('\\begin{tabular}{%s}' % spec)
if comment_lines:
colspan = len(self.attrs)
wrap_width = self._tex_comment_wrap_width(widths)
for line in comment_lines:
wrapped = self._wrap_tex_comment_line('#' + line, wrap_width)
for part in wrapped:
comment = self._tex_escape(part)
if self._color_enabled:
content = '\\textcolor{gray}{\\ttfamily %s}' % comment
else:
content = '\\ttfamily %s' % comment
print('\\multicolumn{%d}{l}{%s} \\\\' % (colspan, content))
if self.print_column_names:
print('%s \\\\' % ' & '.join('\\textbf{%s}' % self._tex_escape(attr.upper()) for attr in self.attrs))
print('\\hline')
macro = {
'ord': '\\OR',
'form': '\\FO',
'lemma': '\\LE',
'upos': '\\UP',
'xpos': '\\XP',
'feats': '\\FE',
'head': '\\HE',
'deprel': '\\DE',
'deps': '\\DP',
'misc': '\\MI',
}
for row in rows:
cells = []
for attr in self.attrs:
display, _, tip = self._fit_value(attr, row.get(attr), widths[attr], pad=False)
text = self._tex_escape(display)
if attr in ('feats', 'misc') and tip is not None:
text = '%s{%s}[%s]' % (macro[attr], text, self._tex_escape_tooltip(tip))
elif attr in ('feats', 'misc'):
text = '%s{%s}' % (macro[attr], text)
elif tip is not None:
text = '\\tooltip{%s}{%s}' % (text, self._tex_escape_tooltip(tip))
text = '%s{%s}' % (macro[attr], text)
else:
text = '%s{%s}' % (macro[attr], text)
cells.append(text)
print('%s \\\\' % ' & '.join(cells))
print('\\end{tabular}')
if self.tex_style == 'resize':
print('\\end{adjustbox}')
print('\\endgroup')
if self.tex_style != 'standalone':
print('\\bigskip')
print('')
def _header_line(self, widths, colorize=False):
parts = []
for attr in self.attrs:
width = widths[attr]
head = self._fit_text(attr.upper(), width, align_right=(attr == 'head'))
if colorize:
head = self._colorize_ansi(attr, head, marked=False)
parts.append(head)
return ' '.join(parts)
def _separator_line(self, widths):
return ' '.join('-' * widths[attr] for attr in self.attrs)
def _row_line(self, row, widths, colorize=False):
parts = []
marked = row['_is_marked']
for attr in self.attrs:
width = widths[attr]
display, _, _ = self._fit_value(attr, row.get(attr), width, pad=True,
align_right=(attr == 'head'))
if colorize:
display = self._colorize_ansi(attr, display, marked=marked)
parts.append(display)
return ' '.join(parts)
def _fit_value(self, attr, value, width, pad=True, align_right=False):
text = '_' if value is None else str(value)
display, truncated = self._fit_text_with_flag(text, width, pad=pad, align_right=align_right)
tip = None
if self.tooltip:
if truncated:
tip = text
if self.format == 'html' and attr in ('feats', 'misc') and '|' in text:
tip = text.replace('|', '\n')
elif self.tooltip_feats_misc and attr in ('feats', 'misc') and '|' in text:
tip = text if self.format == 'tex' else text.replace('|', '\n')
return display, truncated, tip
@staticmethod
def _fit_text(text, width, align_right=False):
return PrettyConllu._fit_text_with_flag(text, width, pad=True, align_right=align_right)[0]
@staticmethod
def _fit_text_with_flag(text, width, pad=True, align_right=False):
if width <= 0:
return '', False
if len(text) <= width:
if not pad:
return text, False
return (text.rjust(width) if align_right else text.ljust(width)), False
if width <= 3:
return ('.' * width), True
return (text[:width - 3] + '...'), True
def _print_comments_html(self, tree):
lines = self._collect_comment_lines(tree)
if not lines:
return
print('<pre class="comments">')
for line in lines:
print(escape('#' + line))
print('</pre>')
[docs]
def is_marked(self, node):
return self.mark_re.search(str(node.misc)) if self.mark_re is not None else False
@staticmethod
def _colorize_ansi(attr, value, marked=False):
    """Wrap ``value`` in the ANSI color of column ``attr``.

    Marked cells are additionally rendered reversed and bold.
    """
    emphasis = ['reverse', 'bold'] if marked else None
    return colored(value, ANSI_COLOR_OF.get(attr), attrs=emphasis)
@staticmethod
def _tex_escape(value):
replacements = {
'\\': r'\textbackslash{}',
'&': r'\&',
'%': r'\%',
'$': r'\$',
'#': r'\#',
'_': r'\_',
'{': r'\{',
'}': r'\}',
'~': r'\textasciitilde{}',
'^': r'\textasciicircum{}',
}
return ''.join(replacements.get(ch, ch) for ch in value)
def _tex_comment_wrap_width(self, widths):
# Approximate visible table width in monospace characters: sum of column
# widths plus textual separators between columns.
return max(24, sum(widths[attr] for attr in self.attrs) + 3 * (len(self.attrs) - 1))
@staticmethod
def _wrap_tex_comment_line(text, width):
return textwrap.wrap(text, width=width, break_long_words=False, break_on_hyphens=False) or [text]
def _tex_escape_tooltip(self, value):
return self._tex_escape(value).replace('|', r'\string|')
def _print_html_header(self):
print('<!DOCTYPE html>')
print('<html lang="en">')
print('<head>')
print(' <meta charset="utf-8">')
print(' <title>PrettyConllu</title>')
print(' <style>')
print(' body { background: #ffffff; color: #222; font-family: "DejaVu Sans Mono", "Liberation Mono", monospace; margin: 16px; }')
if self._color_enabled:
print(' pre.comments { color: #4d4d4d; margin: 0 0 8px 0; }')
else:
print(' pre.comments { margin: 0 0 8px 0; }')
print(' pre.sentence-gap { margin: 0 0 10px 0; line-height: 1; }')
print(' table.prettyconllu { border-collapse: collapse; margin-bottom: 18px; }')
print(' table.prettyconllu th, table.prettyconllu td { padding: 2px 8px; border: 1px solid #d8d8d8; white-space: pre; }')
print(' table.prettyconllu td { position: relative; }')
print(' table.prettyconllu td .copy-full { position: absolute; left: 8px; right: 8px; top: 2px; bottom: 2px; color: transparent; white-space: pre; user-select: text; overflow: hidden; }')
print(' table.prettyconllu td .display-short { user-select: none; }')
print(' table.prettyconllu th { background: #f5f5f5; }')
if self._color_enabled:
print(' table.prettyconllu tr.marked { background: #fff4cf; }')
print(' table.prettyconllu td.head, table.prettyconllu th.head { text-align: right; }')
if self._color_enabled:
for attr, color in HTML_COLOR_OF.items():
print(' table.prettyconllu .%s { color: %s; }' % (attr, color))
print(' </style>')
print('</head>')
print('<body>')
def _print_tex_header(self):
if self.tex_style == 'standalone':
print('\\documentclass[multi=mypage]{standalone}')
else:
print('\\documentclass[11pt]{article}')
print('\\usepackage[margin=1.8cm]{geometry}')
if self._color_enabled:
print('\\usepackage[table]{xcolor}')
if self.tex_style == 'resize':
print('\\usepackage{adjustbox}')
print('\\usepackage{pdfcomment}')
print('\\usepackage{xparse}')
print('\\usepackage[T1]{fontenc}')
print('\\usepackage[utf8]{inputenc}')
print('\\usepackage{textcomp}')
print('\\setlength{\\parindent}{0pt}')
print('\\newcommand{\\tooltip}[2]{\\pdftooltip{#1}{#2}}')
print('\\newenvironment{mypage}{}{}')
if self._color_enabled:
print('\\def\\CL#1{\\noindent{\\color{gray}\\ttfamily #1}\\par}')
print('\\def\\OR#1{\\textcolor{PrettyOrd}{#1}}')
print('\\def\\FO#1{\\textcolor{PrettyForm}{#1}}')
print('\\def\\LE#1{\\textcolor{PrettyLemma}{#1}}')
print('\\def\\UP#1{\\textcolor{PrettyUpos}{#1}}')
print('\\def\\XP#1{\\textcolor{PrettyXpos}{#1}}')
print('\\NewDocumentCommand{\\FE}{m o}{\\textcolor{PrettyFeats}{\\IfNoValueTF{#2}{#1}{\\tooltip{#1}{#2}}}}')
print('\\def\\HE#1{\\textcolor{PrettyHead}{#1}}')
print('\\def\\DE#1{\\textcolor{PrettyDeprel}{#1}}')
print('\\def\\DP#1{\\textcolor{PrettyDeps}{#1}}')
print('\\NewDocumentCommand{\\MI}{m o}{\\textcolor{PrettyMisc}{\\IfNoValueTF{#2}{#1}{\\tooltip{#1}{#2}}}}')
print('\\definecolor{PrettyOrd}{HTML}{226622}')
print('\\definecolor{PrettyForm}{HTML}{7A3E00}')
print('\\definecolor{PrettyLemma}{HTML}{5A2B8F}')
print('\\definecolor{PrettyUpos}{HTML}{B00020}')
print('\\definecolor{PrettyXpos}{HTML}{4C4C4C}')
print('\\definecolor{PrettyFeats}{HTML}{7A1EA1}')
print('\\definecolor{PrettyHead}{HTML}{116611}')
print('\\definecolor{PrettyDeprel}{HTML}{0A4FB5}')
print('\\definecolor{PrettyDeps}{HTML}{0A7B7B}')
print('\\definecolor{PrettyMisc}{HTML}{B24800}')
else:
print('\\def\\CL#1{\\noindent{\\ttfamily #1}\\par}')
print('\\def\\OR#1{#1}')
print('\\def\\FO#1{#1}')
print('\\def\\LE#1{#1}')
print('\\def\\UP#1{#1}')
print('\\def\\XP#1{#1}')
print('\\NewDocumentCommand{\\FE}{m o}{\\IfNoValueTF{#2}{#1}{\\tooltip{#1}{#2}}}')
print('\\def\\HE#1{#1}')
print('\\def\\DE#1{#1}')
print('\\def\\DP#1{#1}')
print('\\NewDocumentCommand{\\MI}{m o}{\\IfNoValueTF{#2}{#1}{\\tooltip{#1}{#2}}}')
print('\\begin{document}')