"""Html class is a writer for HTML+JavaScript+SVG visualization of dependency trees."""
from udapi.core.basewriter import BaseWriter
class Html(BaseWriter):
    """A writer for HTML+JavaScript+SVG visualization of dependency trees.

    .. code-block:: bash

      # from the command line
      udapy write.Html < file.conllu > file.html
      firefox file.html

    For offline use, we need to download first three JavaScript libraries::

      wget https://code.jquery.com/jquery-2.1.4.min.js
      wget https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js
      wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js
      udapy write.Html path_to_js=. < file.conllu > file.html
      firefox file.html

    This writer produces an html file with drawings of the dependency trees
    in the document (there are buttons for selecting which bundle will be shown).
    Under each node its form, upos and deprel are shown.
    In the tooltip its lemma and (morphological) features are shown.
    After clicking the node, all other attributes are shown.
    When hovering over a node, the respective word in the (plain text) sentence
    is highlighted. There is a button for downloading trees as SVG files.

    Three JavaScript libraries are required (jquery, FileSaver and js-treex-view).
    By default they are linked online (so Internet access is needed when viewing),
    but they can be also downloaded locally (so offline browsing is possible and
    the loading is faster): see the offline-use example above.

    This block is based on `Treex::View <https://metacpan.org/release/Treex-View>`_
    but takes a different approach. `Treex::View` depends on (older version of)
    `Valence` (Perl interface to `Electron <https://electron.atom.io/>`_)
    and comes with a script `view-treex`, which takes a treex file,
    converts it to json behind the scenes (which is quite slow)
    and displays the json in a Valence window.

    This block generates the json code directly to the html file,
    so it can be viewed with any browser or even published online.
    (Most of the html file is actually the json.)

    When viewing the html file, the JavaScript library `js-treex-view`
    generates an svg on the fly from the json.
    """

    def __init__(self, path_to_js='web', **kwargs):
        """Create the writer.

        Arguments:
        * `path_to_js` path to jquery, FileSaver and js-treex-view.
          `web` means
          https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js,
          https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js and
          https://code.jquery.com/jquery-2.1.4.min.js
          will be linked.
          `path_to_js=.` means the libraries will be searched in the current directory.

        All other keyword arguments are passed on to the parent constructor.
        """
        super().__init__(**kwargs)
        self.path_to_js = path_to_js

    def process_document(self, doc):
        """Print a complete HTML page visualizing all trees of `doc`.

        The page links the three JavaScript libraries (online or from
        `self.path_to_js`), embeds the document as a JSON literal assigned to
        the JavaScript variable ``data`` and adds a "Save as SVG" button.
        Everything is emitted with `print()`.
        """
        if self.path_to_js == 'web':
            jquery = 'https://code.jquery.com/jquery-2.1.4.min.js'
            fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js'
            js_t_v = 'https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js'
        else:
            jquery = self.path_to_js + '/jquery-2.1.4.min.js'
            fsaver = self.path_to_js + '/FileSaver.min.js'
            js_t_v = self.path_to_js + '/js-treex-view.js'

        print('<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">')
        print('<title>Udapi viewer</title>')  # TODO doc.loaded_from
        for js_file in (jquery, fsaver, js_t_v):
            print('<script src="%s"></script>' % js_file)
        print('</head>\n<body>')
        print('<button style="float:right" type="submit" onclick="saveTree()">'
              '<span>Save as SVG</span></button><div id="treex-view"></div><script>')
        print('data=', end='')
        self.print_doc_json(doc)
        print(';')
        print("$('#treex-view').treexView(data);")
        # Client-side helper: serialize the currently displayed <svg> and offer it
        # for download via FileSaver's saveAs().
        print('function saveTree() {\n'
              "var svg_el = jQuery('svg');\n"
              'if (svg_el.length) {\n'
              'var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"});\n'
              "saveAs(svg, 'tree.svg');\n"
              '}\n'
              '}')
        print('</script></body></html>')

    def print_doc_json(self, doc):
        """Print the JSON array describing all bundles of `doc`.

        `doc` is iterated; each item is either a bundle (an object with a
        `trees` attribute) or directly a tree, which allows calling
        ``print_doc_json([tree1, tree2])``. For each item one JSON object with
        the keys ``zones`` (one entry per tree) and ``desc`` (the sentence
        description assembled from the fragments returned by
        `print_node_json`) is printed.
        """
        print('[')
        for bundle_number, bundle in enumerate(doc, 1):
            if bundle_number != 1:
                print(',', end='')
            print('{"zones":{', end='')
            try:
                trees = bundle.trees
            except AttributeError:
                # Not a bundle: allow to call print_doc_json([tree1, tree2]).
                trees = [bundle]

            # Every fragment starts with a comma; the very first one is stripped
            # when the fragments are joined below.
            desc_parts = []
            for zone_number, tree in enumerate(trees):
                if zone_number:
                    print(',', end='')
                # The zone is used both as a JSON key and inside JSON strings.
                zone = _esc(tree.zone)
                first_child = tree.children[0] if tree.children else None
                print('"%s":{"sentence":"%s",' % (zone, _esc(tree.text)), end='')
                print('"trees":{"a":{"language":"%s","nodes":[' % zone)
                print('{"id":%s,"parent":null,' % _id(tree), end='')
                print('"firstson":' + _id(first_child), end=',')
                print('"labels":["zone=%s","id=%s"]}' % (zone, _esc(tree.address())))
                desc_parts.append(',["[%s]","label"],[" ","space"]' % zone)
                for node in tree.descendants:
                    desc_parts.append(self.print_node_json(node))
                desc_parts.append(r',["\n","newline"]')
                print(']}}}')
            # print desc without the extra starting comma
            print('},"desc":[%s]}' % ''.join(desc_parts)[1:])
        print(']')

    @staticmethod
    def print_node_json(node):
        """Print the JSON object of a given node and return its `desc` fragment.

        The printed object starts with a comma (the root object is always
        printed before it) and holds the node id, the ids of its parent, first
        child and right sibling, its attributes, labels and tooltip hint.

        Returns:
            A string starting with a comma: the JSON pair ``["form", id]``,
            followed by a space item unless ``SpaceAfter=No`` is in `misc`.
        """
        names = ['ord', 'misc', 'form', 'lemma', 'upos', 'xpos', 'feats', 'deprel']
        values = node.get_attrs(names, undefs='')
        order, misc, form, lemma, upos, xpos, feats, deprel = [_esc(x) for x in values]
        id_node = _id(node)

        firstson = node.children[0] if node.children else None
        # The right sibling is the first child of the parent that follows this node.
        rbrother = next((n for n in node.parent.children if node.precedes(n)), None)

        node_json = (
            ',{{"id":{id_node},"parent":{id_parent},"order":{order},{firstson_str}{rbrother_str}'
            '"data":{{"ord":{order},"form":"{form}","lemma":"{lemma}","upos":"{upos}",'
            '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",'  # TODO: deps
            '"misc":"{misc}","id":"{address}"}},'
            '"labels":["{form}","#{{#bb0000}}{upos}","#{{#0000bb}}{deprel}"],'
            '"hint":"lemma={lemma}\\n{multiline_feats}"}}'
        )
        print(node_json.format(
            id_node=id_node,
            id_parent=_id(node.parent),
            order=order,
            firstson_str='"firstson":%s,' % _id(firstson) if firstson else '',
            rbrother_str='"rbrother":%s,' % _id(rbrother) if rbrother else '',
            form=form,
            lemma=lemma,
            upos=upos,
            xpos=xpos,
            feats=feats,
            deprel=deprel,
            misc=misc,
            address=_esc(node.address()),
            # One feature per line in the tooltip (a JSON "\n" escape sequence).
            multiline_feats=feats.replace('|', r'\n'),
        ))

        desc = ',["{}",{}]'.format(form, id_node)
        if 'SpaceAfter=No' not in misc:
            desc += ',[" ","space"]'
        return desc
def _id(node):
    """Return the quoted JSON/DOM identifier of `node`, or ``null`` for None.

    The id needs to be a valid DOM querySelector, so it cannot contain ``#``
    nor ``/`` and it cannot start with a digit. Both characters of the node
    address are therefore turned into ``-`` and the letter ``n`` is prepended.
    """
    if node is not None:
        dom_safe = node.address().translate(str.maketrans('#/', '--'))
        return '"n{}"'.format(dom_safe)
    return 'null'
# Single-pass translation table used by `_esc`: every ASCII control character
# becomes a \u00XX escape (with the usual short forms for newline, carriage
# return and tab), and the backslash and the double quote are backslash-escaped.
_ESC_TABLE = {code: '\\u%04x' % code for code in range(0x20)}
_ESC_TABLE.update({
    ord('\\'): '\\\\',
    ord('"'): '\\"',
    ord('\n'): '\\n',
    ord('\r'): '\\r',
    ord('\t'): '\\t',
})


def _esc(string):
    """Escape `string` for use inside a double-quoted JSON/JavaScript string.

    Backslashes, double quotes and control characters are escaped, so the
    result always forms a valid JSON string body. In addition ``</`` is
    written as ``<\\/`` (an equivalent JSON escape), so that text such as
    ``</script>`` cannot terminate the inline script element of the html file.

    Arguments:
    * `string` the text to escape; `None` is treated as an empty string.

    Returns the escaped text without the surrounding quotes.
    """
    if string is None:
        return ''
    return string.translate(_ESC_TABLE).replace('</', '<\\/')