"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference.
When using lazy loading of documents (infinite scrolling),
modern browsers don't allow JavaScript to load files from a local file system
("Access to XMLHttpRequest at 'file://.../doc2.html' from origin 'null' has been
blocked by CORS policy: Cross origin requests are only supported for protocol schemes:
http, data, chrome, chrome-extension, https.")
The recommended solution is to start a local web server, e.g. using
python -m http.server
and browse http://0.0.0.0:8000/my.html.
A non-recommended alternative is to run
google-chrome --new-window --user-data-dir=/tmp/chrome-proxy --allow-file-access-from-files my.html
"""
import contextlib
import gzip
import html
import os
import re
import sys
from collections import Counter

import udapi.block.write.html
from udapi.core.basewriter import BaseWriter
from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention
# Entity types that get their own background colour (one CSS class per type).
ETYPES = [
    'person', 'place', 'organization', 'animal', 'plant', 'object',
    'substance', 'time', 'number', 'abstract', 'event',
]
# Head UPOS categories used for the alternative "head upos" colouring;
# the last item ('OTHER') is the fallback for any UPOS not listed before it.
HTYPES = ['PROPN', 'NOUN', 'PRON', 'VERB', 'DET', 'OTHER']
# HTML prologue that loads jQuery and pako from public CDNs.
# NOTE(review): not referenced anywhere in the visible code; process_document
# prints equivalent lines itself -- confirm before removing.
HEADER = '''
<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">
<title>Udapi CorefUD viewer</title>
<script src="https://code.jquery.com/jquery-3.6.3.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>
'''
# Static stylesheet printed by process_document right after the opening <style> tag.
# The per-entity-type, per-head-type and per-colour rules are generated there
# and appended after this text.
CSS = '''
#wrap {display: flex; align-items: flex-start;}
#main {width: 100%; padding: 5px; background: white; z-index:100;}
#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal;
display: grid; border-right: double;
padding: 5px; width: 20em; background: #ddd; border-radius: 5px;
}
#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none;
padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;}
#main-menu div {display: inline-block;}
#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;}
#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;}
.change .b1 {transform: translate(0, 9px) rotate(-45deg);}
.change .b2 {opacity: 0;}
.change .b3 {transform: translate(0, -9px) rotate(45deg);}
.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;}
.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline}
.nobox .labels {display: inline;}
.nocolor {color: black !important;}
.nobold {font-weight: normal;}
.labels {display: block; font-size: 10px;}
.showtree {margin: 5px; user-select: none;}
.display-inline {display: inline;}
.close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px}
i.empty {color: gray; border: 3px outset gray; padding: 1px;}
.sentence .singleton {border-style: dotted;}
.crossing:before {content: "!"; display: block; background: #ffd500;}
.active {border: 1px solid red !important;}
.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;}
.sent_id {display: none; background: #ddd; border-radius: 3px;}
'''
# JavaScript printed into every generated page (after `all_docs` and `docs_dir`
# have been defined by process_document). It wires up mention selection and
# highlighting, the options menu, and lazy loading of docN.html.gz files when
# the user scrolls near the bottom of the page.
#
# Differences from the naive version of load_doc:
# * `data` is a local variable and a failed load returns early, so a failure
#   can no longer append `undefined` or the previously loaded document again.
# * `loading_now` is reset on failure, so later documents can still be tried
#   (the alert itself is shown only once, guarded by `load_fail_reported`).
# * The "trees" checkbox is read with .is(':checked'), which is simply false
#   when the checkbox does not exist (pages generated with show_trees=False);
#   indexing `[0].checked` threw there and blocked all further lazy loading.
SCRIPT_BASE = '''
function add_mention_listeners(mentions){
mentions.click(function(e) {
let was_selected = $(this).hasClass("selected");
$(".m").removeClass("selected");
if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");}
e.stopPropagation();
});
mentions.hover(
function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");},
function(e) {$(".m").removeClass("active");}
);
}
add_mention_listeners($(".m"));
window.onhashchange = function() {
$(".m").removeClass("selected");
var fragment = window.location.hash.substring(1);
if (fragment) {$("." + fragment).addClass("selected");}
}
function menuclick(x) {
x.classList.toggle("change");
$("#main-menu").toggle();
}
async function load_doc(doc_num) {
loading_now = true;
let filename = docs_dir + "/doc" + doc_num + ".html.gz"
console.log("loading " + filename);
let data;
try {
const res = await fetch(filename);
let raw = await res.arrayBuffer();
data = pako.inflate(raw, {to: "string"});
} catch (error){
if (! load_fail_reported) {
load_fail_reported = true;
alert("Cannot load " + filename + "\\nLocal files do not support lazy loading."
+ " Run a web server 'python -m http.server'\\n"
+ "error = " + error);
}
loading_now = false;
return;
}
$("#main").append(data);
add_mention_listeners($("#doc" + doc_num + " .m"));
$("#doc" + doc_num + " .sentence").each(add_show_tree_button);
$('.eid').toggle($('#show-eid')[0].checked);
$('.etype').toggle($('#show-etype')[0].checked);
$('.sent_id').toggle($('#show-sent_id')[0].checked);
$('.showtree').toggle($('#show-trees').is(':checked'));
$('.m').toggleClass('nocolor', ! $('#show-color')[0].checked);
$('.m').toggleClass('nobox', ! $('#show-boxes')[0].checked);
$('.norm').toggle($('#show-norm')[0].checked);
$('.head').toggleClass('nobold', ! $('#show-heads')[0].checked);
$('.empty').toggle($('#show-empty')[0].checked);
$('.sentence').toggleClass('display-inline', ! $('#show-breaks')[0].checked);
$('.par').toggle($('#show-pars')[0].checked);
$('h1').toggle($('#show-docs')[0].checked);
$('.m').toggleClass('htype',$('#htype')[0].checked)
loading_now = false;
}
var docs_loaded = 1;
var load_fail_reported = false;
var loading_now = false;
add_show_tree_button = function(index, el){ // to be redefined later if show_trees=True
$(el).prepend('<span class="sent_id">🆔' + el.dataset.id + '</span>');
}
function load_more() {
if (!loading_now && $(window).scrollTop() >= $(document).height() - $(window).height() - 42 && docs_loaded < all_docs) {
docs_loaded += 1;
load_doc(docs_loaded);
}
}
$(window).scroll(load_more);
const resizeObserver = new ResizeObserver(entries =>load_more());
resizeObserver.observe(document.body);
'''
# JavaScript printed only when show_trees is enabled. It redefines
# add_show_tree_button so that every sentence also gets a button which toggles
# a js-treex-view rendering of its dependency tree. The tree data come from
# docs_json, which is filled lazily from docN.json.gz for documents other
# than the first one.
#
# `doc_number` is declared with `var` so that it is local to each click
# handler: as an implicit global it could be overwritten by another click
# while the handler was still awaiting the fetch of the JSON file.
SCRIPT_SHOWTREE = '''
function show_tree_in_tdiv(tdiv, doc_number, index){
tdiv.treexView([docs_json[doc_number][index]]);
$("<button>", {append:"×", class:"close"}).prependTo(tdiv).on("click", function(){tdiv.remove();});
}
var load_json_fail_reported = false;
add_show_tree_button = function(index, el){
var sent_id = el.id;
$(el).prepend('<span class="sent_id">🆔' + el.dataset.id + '</span>');
$(el).prepend(
$("<button>", {append: "🌲", id:"button-"+sent_id, title: "show dependency tree "+el.dataset.id, class: "showtree"}).on("click", async function() {
var tree_div = $("#tree-"+sent_id);
if (tree_div.length == 0){
$('#button-'+sent_id).attr('title', 'hide dependency tree '+el.dataset.id);
var tdiv = $("<div>", {id:"tree-"+sent_id, class:"tree"}).insertAfter($(el));
var doc_number = 1 * el.parentElement.id.substr(3);
if (docs_json[doc_number]){
show_tree_in_tdiv(tdiv, doc_number, index);
} else {
try {
console.log("loading doc" + doc_number + ".json.gz");
const res = await fetch(docs_dir + "/doc" + doc_number + ".json.gz");
let raw = await res.arrayBuffer();
docs_json[doc_number] = JSON.parse(pako.inflate(raw, {to: "string"}));
show_tree_in_tdiv(tdiv, doc_number, index);
} catch(error) {
if (! load_json_fail_reported) {
load_json_fail_reported = true;
alert("Cannot load " + docs_dir + "/doc" + doc_number + ".json.gz:\\n" + error);
}
}
}
} else {
tree_div.remove();
$('#button-'+sent_id).attr('title', 'show dependency tree '+el.dataset.id);
}
})
);
}
'''
# Shared instance of the generic HTML writer; only its print_doc_json method is
# used below, to serialize trees into the docs_json data read by treexView.
WRITE_HTML = udapi.block.write.html.Html()
class CorefHtml(BaseWriter):
    """Writer of an HTML+JavaScript visualization of coreference.

    Everything is emitted with ``print``. The first document (the trees up to
    the next ``newdoc``) goes into the main page; every further document is
    written to ``doc<N>.html.gz`` (plus ``doc<N>.json.gz`` when ``show_trees``
    is on) inside ``docs_dir``, from where the page fetches it lazily.
    """

    def __init__(self, docs_dir='docs', path_to_js='web',
                 show_trees=True, show_eid=False, show_etype=False, colors=7, rtl=None, **kwargs):
        """Create the writer.

        Args:
            docs_dir: directory for the lazily loaded ``doc<N>.*.gz`` files.
                The value is used as given in the generated JavaScript; on disk
                it is resolved relative to the directory of ``self.path`` when
                that attribute is set (presumably by BaseWriter -- confirm).
            path_to_js: ``'web'`` to load the JavaScript libraries from public
                CDNs, otherwise a prefix under which ``jquery-3.6.3.min.js``,
                ``pako.min.js`` and ``js-treex-view.js`` are expected.
            show_trees: add per-sentence buttons showing dependency trees.
            show_eid: initially show entity ids in mention labels.
            show_etype: initially show entity types in mention labels.
            colors: number of distinct text colors cycled over the entities of
                each type; a falsy value disables text coloring.
            rtl: render sentences right-to-left when truthy.
            **kwargs: passed on to ``BaseWriter``.
        """
        super().__init__(**kwargs)
        self.path_to_js = path_to_js
        self.show_trees = show_trees
        self.show_eid = show_eid
        self.show_etype = show_etype
        self.colors = colors
        self.rtl = rtl
        self.js_docs_dir = docs_dir
        self.docs_dir = docs_dir
        if self.path:
            new_dir, _ = os.path.split(self.path)
            self.docs_dir = os.path.join(new_dir, docs_dir)
        if docs_dir != '.':
            # exist_ok avoids the check-then-create race of os.path.exists().
            os.makedirs(self.docs_dir, exist_ok=True)
        self._mention_ids = {}
        self._entity_colors = {}

    def _representative_word(self, entity):
        """Return a word for the overview table: lemma (or form) of a mention head.

        The first PROPN head is preferred, then the first NOUN head, then the
        head of the first mention.
        """
        heads = [mention.head for mention in entity.mentions]
        best = next((h for upos in ('PROPN', 'NOUN') for h in heads if h.upos == upos),
                    heads[0])
        return best.lemma if best.lemma and best.lemma != '_' else best.form

    def process_ud_doc(self, ud_doc, doc_num):
        """Print all trees of one document wrapped in ``<div class="doc" id="doc<N>">``."""
        print(f'<div class="doc" id="doc{doc_num}">')
        for tree in ud_doc:
            self.process_tree(tree)
        print('</div>')

    def process_document(self, doc):
        """Print the whole HTML page for *doc* and write the lazily loaded side files."""
        ud_docs, doc_num, sent_id2doc = [], 0, {}
        for tree in doc.trees:
            if tree.newdoc or not ud_docs:
                ud_docs.append([])
                doc_num += 1
            ud_docs[-1].append(tree)
            sent_id2doc[tree.sent_id] = doc_num
        # TODO: use sent_id2doc

        self._print_head()
        self._index_mentions(doc)
        self._print_overview(doc)
        print('<div id="main">')
        self._print_menu()

        # The first ud_doc will be printed to the main html file
        # (an empty <div> if the document contains no trees at all).
        self.process_ud_doc(ud_docs[0] if ud_docs else [], 1)
        print('</div>')  # id=main

        # Other ud_docs will be printed into separate files (so they can be loaded lazily)
        lazy_docs = list(enumerate(ud_docs[1:], 2))
        for i, ud_doc in lazy_docs:
            with self._stdout_to_gz(f'doc{i}.html.gz'):
                self.process_ud_doc(ud_doc, i)

        # _esc keeps backslashes and quotes in docs_dir from breaking the JS string literal.
        print(f'<script>\nvar all_docs = {len(ud_docs)};\nvar docs_dir = "{_esc(self.js_docs_dir)}";')
        print(SCRIPT_BASE)
        if self.show_trees:
            print('docs_json = [false, ', end='')  # 1-based index, so dummy docs_json[0]
            if ud_docs:
                WRITE_HTML.print_doc_json(ud_docs[0])
            else:
                print('[]', end='')
            print('];')
            for i, ud_doc in lazy_docs:
                with self._stdout_to_gz(f'doc{i}.json.gz'):
                    WRITE_HTML.print_doc_json(ud_doc)
            print(SCRIPT_SHOWTREE)
        print('$("#doc1 .sentence").each(add_show_tree_button);')
        print('</script>')
        print('</div></body></html>')

    @contextlib.contextmanager
    def _stdout_to_gz(self, basename):
        """Temporarily redirect ``print`` output into a gzip file in ``self.docs_dir``.

        The file is written as UTF-8 (the page declares UTF-8 and inflates the
        data to a string in the browser) and is closed, with stdout restored,
        even if the body raises.
        """
        with gzip.open(f'{self.docs_dir}/{basename}', 'wt', encoding='utf-8') as gz_file:
            with contextlib.redirect_stdout(gz_file):
                yield

    def _print_head(self):
        """Print the ``<head>`` (scripts and styles) and open ``<body>`` and ``#wrap``."""
        print('<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">')
        print('<title>Udapi CorefUD viewer</title>')
        if self.path_to_js == 'web':
            print('<script src="https://code.jquery.com/jquery-3.6.3.min.js"></script>')
            print('<script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>')
            if self.show_trees:
                print('<script src="https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js"></script>')
        else:
            print(f'<script src="{self.path_to_js}/jquery-3.6.3.min.js"></script>')
            print(f'<script src="{self.path_to_js}/pako.min.js"></script>')
            if self.show_trees:
                print(f'<script src="{self.path_to_js}/js-treex-view.js"></script>')
        print('<style>' + CSS)
        for i, etype in enumerate(ETYPES):
            print(f'.{etype} {{background: hsl({int(i * 360/len(ETYPES))}, 80%, 85%);}}')
        print('.other {background: hsl(0, 0%, 85%);}')
        for i, htype in enumerate(HTYPES[:-1]):
            print(f'.htype.{htype} {{background: hsl({int(i * 360/len(HTYPES))}, 80%, 85%);}}')
        print('.htype.OTHER {background: hsl(0, 0%, 85%);}')
        if self.colors:
            for i in range(self.colors):
                print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}')
        if not self.show_eid:
            print('.eid {display: none;}')
        if not self.show_etype:
            print('.etype {display: none;}')
        print('</style>')
        print('</head>\n<body>\n<div id="wrap">')

    def _index_mentions(self, doc):
        """Assign a DOM class to every mention and (optionally) a color class to every entity."""
        self._mention_ids = {}
        self._entity_colors = {}
        entities_of_type = Counter()
        for entity in doc.coref_entities:
            if self.colors:
                # Colors are cycled separately within each entity type.
                count = entities_of_type[entity.etype]
                entities_of_type[entity.etype] = count + 1
                self._entity_colors[entity] = f'c{count % self.colors}'
            for idx, mention in enumerate(entity.mentions, 1):
                self._mention_ids[mention] = f'{_dom_esc(entity.eid)}e{idx}'

    def _print_overview(self, doc):
        """Print the side table listing every entity with its size and a representative word."""
        print('<div id="overview">')
        print('<table><thead><tr><th title="entity id">eid</th>'
              '<th title="number of mentions">#m</th>'
              '<th title="a word best representing the entity">word</th></tr></thead>\n<tbody>')
        for entity in doc.coref_entities:
            word = html.escape(str(self._representative_word(entity)), quote=False)
            print(f'<tr><td><a href="#{_dom_esc(entity.eid)}">{html.escape(str(entity.eid), quote=False)}</a></td>'
                  f'<td>{len(entity.mentions)}</td>'
                  f'<td>{word}</td></tr>')
        print('</tbody></table>')
        print('</div>')

    def _print_menu(self):
        """Print the options menu (checkboxes toggling CSS classes) and its hamburger button."""
        eid_checked = "checked" if self.show_eid else ""
        etype_checked = "checked" if self.show_etype else ""
        trees_row = ''
        if self.show_trees:
            trees_row = (' <input id="show-trees" type="checkbox" checked onclick="$(\'.showtree\').toggle(this.checked);">'
                         '<label for="show-trees">trees</label><br>\n')
        print('<div id="main-menu">Show<br><div>\n'
              f' <input id="show-eid" type="checkbox" {eid_checked} onclick="$(\'.eid\').toggle(this.checked);"><label for="show-eid">eid</label><br>\n'
              f' <input id="show-etype" type="checkbox" {etype_checked} onclick="$(\'.etype\').toggle(this.checked);"><label for="show-etype">etype</label><br>\n'
              ' <input id="show-sent_id" type="checkbox" onclick="$(\'.sent_id\').toggle(this.checked);"><label for="show-sent_id">sent_id</label><br>\n'
              + trees_row +
              ' <input id="show-color" type="checkbox" checked onclick="$(\'.m\').toggleClass(\'nocolor\',!this.checked);"><label for="show-color">colors</label><br>\n'
              ' <input id="show-boxes" type="checkbox" checked onclick="$(\'.m\').toggleClass(\'nobox\',!this.checked);"><label for="show-boxes">boxes</label></div><div>\n'
              ' <input id="show-norm" type="checkbox" checked onclick="$(\'.norm\').toggle(this.checked);"><label for="show-norm">non-mentions</label><br>\n'
              ' <input id="show-heads" type="checkbox" checked onclick="$(\'.head\').toggleClass(\'nobold\',!this.checked);"><label for="show-heads">heads in bold</label><br>\n'
              ' <input id="show-empty" type="checkbox" checked onclick="$(\'.empty\').toggle(this.checked);"><label for="show-empty">empty words</label><br>\n'
              ' <input id="show-breaks" type="checkbox" checked onclick="$(\'.sentence\').toggleClass(\'display-inline\',!this.checked);"><label for="show-breaks">sentence per line</label><br>\n'
              ' <input id="show-pars" type="checkbox" checked onclick="$(\'.par\').toggle(this.checked);"><label for="show-pars">paragraphs</label><br>\n'
              ' <input id="show-docs" type="checkbox" checked onclick="$(\'h1\').toggle(this.checked);"><label for="show-docs">document names</label><br>\n'
              '</div><fieldset onclick="$(\'.m\').toggleClass(\'htype\',$(\'#htype\')[0].checked)"><legend>bg color:</legend>\n'
              '<label><input type="radio" name="bgcolor" id="etype" checked>entity type</label>\n'
              '<label><input type="radio" name="bgcolor" id="htype">head upos</label>\n'
              '</fieldset>\n'
              '</div>\n'
              '<button id="menubtn" title="Visualization options" onclick="menuclick(this)"><div class="b1"></div><div class="b2"></div><div class="b3"></div></button>\n'
              )

    def _start_subspan(self, subspan, crossing=False):
        """Print the opening tag(s) and the label of one mention subspan.

        Args:
            subspan: the subspan to open; its mention must have been indexed
                by the current ``process_document`` call.
            crossing: True when this re-opens a mention that had to be closed
                because it crosses another one; the span is then marked with
                the ``crossing`` class and never carries the entity ``id``.
        """
        m = subspan.mention
        e = m.entity
        # The client-side script relies on the order of the first two classes:
        # [0] is the entity class, [1] the mention class.
        classes = f'{_dom_esc(e.eid)} {self._mention_ids[m]} {e.etype or "other"} m'
        title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}'
        classes += f" {m.head.upos if m.head.upos in HTYPES else 'OTHER'}"
        title += f'\nhead-upos={m.head.upos}'
        if self.colors:
            classes += f' {self._entity_colors[e]}'
        if all(w.is_empty() for w in subspan.words):
            classes += ' empty'
        if len(e.mentions) == 1:
            classes += ' singleton'
        if crossing:
            classes += ' crossing'
            title += '\ncrossing'
        if m.other:
            title += f'\n{m.other}'

        # Only the very first subspan of the entity's first mention is the
        # target of the "#eid" links; a re-opened (crossing) continuation must
        # not repeat the id, which would make it non-unique in the document.
        span_id = ''
        is_first_part = subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')
        if not crossing and is_first_part and e.mentions[0] == m:
            span_id = f'id="{_dom_esc(e.eid)}" '

        # The title should be always rendered left-to-right (e.g. "head=X", not "X=head"),
        # so for RTL languages, we need to use explicit dir="ltr" and insert a nested span with dir="rtl".
        outer_dir = ' dir="ltr"' if self.rtl else ''
        inner_open = '<span dir="rtl">' if self.rtl else ''
        # Word forms may contain quotes, "<" or "&", so the attribute value is escaped.
        print(f'<span {span_id}class="{classes}" title="{html.escape(title)}"{outer_dir}>'
              f'<span class="labels"><b class="eid">{_dom_esc(subspan.subspan_eid)}</b>'
              f' <i class="etype">{html.escape(str(e.etype), quote=False)}</i></span>{inner_open}', end='')

    def process_tree(self, tree):
        """Print one sentence as ``<p class="sentence">`` with nested mention spans.

        Words outside any mention are wrapped in ``<span class="norm">``;
        mention heads are wrapped in ``<b class="head">`` and empty nodes in
        ``<i class="empty">``.
        """
        nodes_and_empty = tree.descendants_and_empty
        mentions = {m for node in nodes_and_empty for m in node.coref_mentions}

        subspans = []
        for mention in mentions:
            subspans.extend(mention._subspans())
        # Reverse order, so that the next subspan to be opened can be popped from the end.
        subspans.sort(reverse=True)

        if tree.newdoc:
            docname = tree.newdoc if tree.newdoc is not True else ""
            print(f'<hr><h1>{html.escape(str(docname), quote=False)}</h1><hr>')
        elif tree.newpar:
            print('<hr class="par">')

        opened, prev_node_mention = [], True
        rtl = ' dir="rtl"' if self.rtl else ""
        # In RTL mode every subspan consists of two nested <span> elements.
        close_tag = '</span></span>' if self.rtl else '</span>'
        print(f'<p class="sentence" data-id="{html.escape(str(tree.sent_id))}" id="{_id(tree)}"{rtl}>')
        for node in nodes_and_empty:
            # A mention starts here, so close the running "norm" span first.
            if not prev_node_mention and subspans and subspans[-1].words[0] == node:
                print('</span>', end='')
            while subspans and subspans[-1].words[0] == node:
                subspan = subspans.pop()
                self._start_subspan(subspan)
                opened.append(subspan)

            if not opened and prev_node_mention:
                print('<span class="norm">', end='')
            prev_node_mention = bool(opened)

            is_head = self._is_head(node)
            if is_head:
                print('<b class="head">', end='')
            if node.is_empty():
                print('<i class="empty">', end='')
            print(html.escape(str(node.form), quote=False), end='')
            if node.is_empty():
                print('</i>', end='')
            if is_head:
                print('</b>', end='')

            while opened and opened[-1].words[-1] == node:
                print(close_tag, end='')
                opened.pop()

            # Two mentions are crossing iff their spans have non-zero intersection,
            # but neither is a subset of the other, e.g. (e1 ... (e2 ... e1) ... e2).
            # Let's visualize this (simplified) as
            # <span class=e1>...<span class=e2>...</span></span><span class="e2 crossing">...</span>
            # i.e. let's split mention e2 into two subspans which are next to each other.
            # Unfortunately, we cannot mark now both crossing mentions using html class "crossing"
            # (opening tags are already printed), so we'll mark only the second part of the second mention.
            endings = [x for x in opened if x.words[-1] == node]
            if endings:
                new_opened, brokens, found_crossing = [], [], False
                for subspan in opened:
                    if subspan.words[-1] == node:
                        found_crossing = True
                    elif found_crossing:
                        brokens.append(subspan)
                    else:
                        new_opened.append(subspan)
                opened = new_opened
                print(close_tag * (len(endings) + len(brokens)), end='')
                for broken in brokens:
                    self._start_subspan(broken, True)
                    # Track the subspan that was actually re-opened (not the
                    # stale loop variable), so each one is closed at its own end.
                    opened.append(broken)

            if not node.no_space_after:
                print(' ', end='')

        if not prev_node_mention:
            print('</span>', end='')
        print('</p>')

    def _is_head(self, node):
        """Return the first mention whose head is *node*, or None if there is none."""
        return next((mention for mention in node.coref_mentions if mention.head == node), None)
# id needs to be a valid DOM querySelector
# so it cannot contain [#./:] and maybe more,
# so let's substitute all [^\w\d-] to be on the safe side.
# DOM IDs cannot start with a digit, so prepend e.g. "n" if needed.
def _dom_esc(string):
if string[0].isdecimal():
string = 'n' + string
return re.sub(r'[^\w\d-]', '_', string)
def _id(node):
if node is None:
return 'null'
return _dom_esc(node.address())
def _esc(string):
if string is None:
string = ''
return string.replace('\\', '\\\\').replace('"', r'\"')