Source code for udapi.block.write.corefhtml

"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference.

When using lazy loading of documents (infinite scrolling),
modern browsers don't allow JavaScript to load files from a local file system
("Access to XMLHttpRequest at 'file://.../doc2.html' from origin 'null' has been
blocked by CORS policy: Cross origin requests are only supported for protocol schemes:
http, data, chrome, chrome-extension, https.")

The recommended solution is to start a local web server, e.g. using
  python -m http.server
and browse http://0.0.0.0:8000/my.html.

A non-recommended alternative is to run
 google-chrome --new-window --user-data-dir=/tmp/chrome-proxy --allow-file-access-from-files my.html
"""
import contextlib
import gzip
import html
import os
import re
import sys
from collections import Counter

import udapi.block.write.html
from udapi.core.basewriter import BaseWriter
from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention

# Entity types that get their own CSS background colour (one hue per type);
# any other etype is rendered with the grey ".other" class.
ETYPES = [
    'person', 'place', 'organization', 'animal', 'plant', 'object',
    'substance', 'time', 'number', 'abstract', 'event',
]

# UPOS tags of mention heads that get their own ".htype.<UPOS>" background.
# The last item, 'OTHER', is the fallback class for every other UPOS.
HTYPES = ['PROPN', 'NOUN', 'PRON', 'VERB', 'DET', 'OTHER']

# Static HTML prologue (doctype, title, CDN <script> tags for jQuery and pako).
# NOTE(review): nothing in this module references HEADER; process_document()
# prints the same tags itself so that it can honour `path_to_js`.
HEADER = '''
<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">
<title>Udapi CorefUD viewer</title>
<script src="https://code.jquery.com/jquery-3.6.3.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>
'''

# Static part of the stylesheet. process_document() prints it right after the
# opening <style> tag and then appends the generated rules (per-etype and
# per-head-UPOS backgrounds, the .c<N> text colours and the initial visibility
# of .eid/.etype labels).
CSS = '''
#wrap {display: flex; align-items: flex-start;}
#main {width: 100%; padding: 5px; background: white; z-index:100;}
#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal;
            display: grid; border-right: double;
            padding: 5px; width: 20em; background: #ddd; border-radius: 5px;
}
#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none;
            padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;}
#main-menu div {display: inline-block;}
#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;}
#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;}
.change .b1 {transform: translate(0, 9px) rotate(-45deg);}
.change .b2 {opacity: 0;}
.change .b3 {transform: translate(0, -9px) rotate(45deg);}

.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;}
.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline}
.nobox .labels {display: inline;}
.nocolor {color: black !important;}
.nobold {font-weight: normal;}
.labels {display: block; font-size: 10px;}
.showtree {margin: 5px; user-select: none;}
.display-inline {display: inline;}
.close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px}
i.empty {color: gray; border: 3px outset gray; padding: 1px;}
.sentence .singleton {border-style: dotted;}
.crossing:before {content: "!"; display: block; background: #ffd500;}
.active {border: 1px solid red !important;}
.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;}
.sent_id {display: none; background: #ddd; border-radius: 3px;}
'''

# JavaScript printed into the main page by process_document(), after the
# `all_docs` and `docs_dir` variables have been defined. It provides:
#  * click/hover highlighting of mentions (class[0] = entity, class[1] = mention),
#  * selection of an entity via the URL fragment,
#  * the hamburger menu toggle,
#  * lazy loading ("infinite scrolling") of docN.html.gz files via pako.
#
# Fixes compared with the previous version of load_doc():
#  * `data` is a local variable and nothing is appended when loading fails
#    (previously an undeclared/stale global was appended after an error),
#  * `loading_now` is reset on failure, so one failure does not block all
#    further loading,
#  * the "show-trees" checkbox does not exist when show_trees=False, so its
#    state is read with `.is(':checked')` (false for an empty selection)
#    instead of `[0].checked`, which threw a TypeError and left
#    `loading_now` stuck at true,
#  * a non-2xx HTTP response is reported as such instead of being passed to
#    pako.inflate.
SCRIPT_BASE = '''
function add_mention_listeners(mentions){
 mentions.click(function(e) {
   let was_selected = $(this).hasClass("selected");
   $(".m").removeClass("selected");
   if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");}
   e.stopPropagation();
  });
 mentions.hover(
   function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");},
   function(e) {$(".m").removeClass("active");}
  );
}
add_mention_listeners($(".m"));

window.onhashchange = function() {
 $(".m").removeClass("selected");
 var fragment = window.location.hash.substring(1);
 if (fragment) {$("." + fragment).addClass("selected");}
}

function menuclick(x) {
  x.classList.toggle("change");
  $("#main-menu").toggle();
}

async function load_doc(doc_num) {
  loading_now = true;
  let filename = docs_dir + "/doc" + doc_num + ".html.gz"
  console.log("loading " + filename);
  let data;
  try {
    const res = await fetch(filename);
    if (!res.ok) {throw new Error("HTTP status " + res.status);}
    let raw = await res.arrayBuffer();
    data = pako.inflate(raw, {to: "string"});
  } catch (error){
    if (! load_fail_reported) {
      load_fail_reported = true;
      alert("Cannot load " + filename + "\\nLocal files do not support lazy loading."
      + " Run a web server 'python -m http.server'\\n"
      + "error = " + error);
    }
    loading_now = false;
    return;
  }
  $("#main").append(data);
  add_mention_listeners($("#doc" + doc_num + " .m"));
  $("#doc" + doc_num + " .sentence").each(add_show_tree_button);
  $('.eid').toggle($('#show-eid')[0].checked);
  $('.etype').toggle($('#show-etype')[0].checked);
  $('.sent_id').toggle($('#show-sent_id')[0].checked);
  $('.showtree').toggle($('#show-trees').is(':checked'));
  $('.m').toggleClass('nocolor', ! $('#show-color')[0].checked);
  $('.m').toggleClass('nobox', ! $('#show-boxes')[0].checked);
  $('.norm').toggle($('#show-norm')[0].checked);
  $('.head').toggleClass('nobold', ! $('#show-heads')[0].checked);
  $('.empty').toggle($('#show-empty')[0].checked);
  $('.sentence').toggleClass('display-inline', ! $('#show-breaks')[0].checked);
  $('.par').toggle($('#show-pars')[0].checked);
  $('h1').toggle($('#show-docs')[0].checked);
  $('.m').toggleClass('htype',$('#htype')[0].checked)
  loading_now = false;
}

var docs_loaded = 1;
var load_fail_reported = false;
var loading_now = false;
add_show_tree_button = function(index, el){ // to be redefined later if show_trees=True
  $(el).prepend('<span class="sent_id">🆔' + el.dataset.id + '</span>');
}
function load_more() {
  if (!loading_now && $(window).scrollTop() >= $(document).height() - $(window).height() - 42 && docs_loaded < all_docs) {
    docs_loaded += 1;
    load_doc(docs_loaded);
  }
}
$(window).scroll(load_more);
const resizeObserver = new ResizeObserver(entries =>load_more());
resizeObserver.observe(document.body);
'''

# JavaScript printed by process_document() only when show_trees is enabled,
# after SCRIPT_BASE and after the `docs_json` array has been defined.
# It redefines add_show_tree_button() so that, besides the sent_id label, each
# sentence gets a button toggling a js-treex-view rendering of its dependency
# tree. The tree data of lazily loaded documents is fetched on demand from
# <docs_dir>/doc<N>.json.gz and cached in docs_json[N].
# NOTE(review): `doc_number` is assigned without var/let inside the async click
# handler, i.e. it is an implicit global shared by all handlers - consider
# declaring it locally.
SCRIPT_SHOWTREE = '''
function show_tree_in_tdiv(tdiv, doc_number, index){
  tdiv.treexView([docs_json[doc_number][index]]);
  $("<button>", {append:"×", class:"close"}).prependTo(tdiv).on("click", function(){tdiv.remove();});
}

var load_json_fail_reported = false;
add_show_tree_button = function(index, el){
  var sent_id = el.id;
  $(el).prepend('<span class="sent_id">🆔' + el.dataset.id + '</span>');
  $(el).prepend(
    $("<button>", {append: "🌲", id:"button-"+sent_id, title: "show dependency tree "+el.dataset.id, class: "showtree"}).on("click", async function() {
      var tree_div = $("#tree-"+sent_id);
      if (tree_div.length == 0){
        $('#button-'+sent_id).attr('title', 'hide dependency tree '+el.dataset.id);
        var tdiv = $("<div>", {id:"tree-"+sent_id, class:"tree"}).insertAfter($(el));
        doc_number = 1 * el.parentElement.id.substr(3);
        if (docs_json[doc_number]){
          show_tree_in_tdiv(tdiv, doc_number, index);
        } else {
          try {
            console.log("loading doc" + doc_number + ".json.gz");
            const res = await fetch(docs_dir + "/doc" + doc_number + ".json.gz");
            let raw = await res.arrayBuffer();
            docs_json[doc_number] = JSON.parse(pako.inflate(raw, {to: "string"}));
            show_tree_in_tdiv(tdiv, doc_number, index);
          } catch(error) {
            if (! load_json_fail_reported) {
              load_json_fail_reported = true;
              alert("Cannot load " + docs_dir + "/doc" + doc_number + ".json.gz:\\n" + error);
            }
          }
        }
      } else {
        tree_div.remove();
        $('#button-'+sent_id).attr('title', 'show dependency tree '+el.dataset.id);
      }
    })
  );
}
'''

# Module-level instance of the write.Html block; this module uses only its
# print_doc_json() method to serialize trees for the js-treex-view widget.
WRITE_HTML = udapi.block.write.html.Html()

class CorefHtml(BaseWriter):
    """Writer of an HTML+JavaScript visualization of coreference.

    The first document (a sequence of trees up to the next `newdoc`) is
    printed into the main HTML output. Every following document is written
    into `<docs_dir>/doc<N>.html.gz` (and `doc<N>.json.gz` with the dependency
    trees if `show_trees` is on) and loaded lazily by the JavaScript in
    SCRIPT_BASE / SCRIPT_SHOWTREE.

    All text taken from the data (word forms, titles, document names, sent_id,
    entity ids shown as text) is HTML-escaped before printing.
    """

    def __init__(self, docs_dir='docs', path_to_js='web', show_trees=True, show_eid=False,
                 show_etype=False, colors=7, rtl=None, **kwargs):
        """Create the writer.

        Args:
            docs_dir: directory for the lazily loaded `doc<N>.*.gz` files.
                The value is used as given in the generated JavaScript, while
                the files are written relative to the directory of `self.path`
                (if that attribute, set by the base class, is non-empty).
                The directory is created unless `docs_dir` is '.'.
            path_to_js: 'web' loads jQuery, pako and js-treex-view from public
                CDNs; any other value is used as the prefix of local copies of
                jquery-3.6.3.min.js, pako.min.js and js-treex-view.js.
            show_trees: add buttons showing the dependency tree of a sentence.
            show_eid: show entity-id labels initially.
            show_etype: show entity-type labels initially.
            colors: number of distinct text colours assigned cyclically to
                entities of the same type; a falsy value disables them.
            rtl: if true, sentences are rendered with dir="rtl".
            **kwargs: passed to BaseWriter.
        """
        super().__init__(**kwargs)
        self.path_to_js = path_to_js
        self.show_trees = show_trees
        self.show_eid = show_eid
        self.show_etype = show_etype
        self.colors = colors
        self.rtl = rtl
        self.js_docs_dir = docs_dir
        self.docs_dir = docs_dir
        if self.path:
            new_dir, _ = os.path.split(self.path)
            self.docs_dir = os.path.join(new_dir, docs_dir)
        if docs_dir != '.':
            # exist_ok avoids the check-then-create race of os.path.exists().
            os.makedirs(self.docs_dir, exist_ok=True)
        self._mention_ids = {}
        self._entity_colors = {}

    @staticmethod
    def _html(value):
        """Return `str(value)` escaped for use in HTML text and attribute values."""
        return html.escape(str(value))

    def _representative_word(self, entity):
        """Return a word representing `entity` in the overview table.

        It is the lemma (or the form, if the lemma is missing or '_') of the
        first mention head tagged PROPN, else of the first head tagged NOUN,
        else of the head of the first mention.
        """
        # TODO: return the first PROPN or NOUN. Or the most frequent one?
        def lemma_or_form(node):
            return node.lemma if node.lemma and node.lemma != '_' else node.form

        heads = [mention.head for mention in entity.mentions]
        for upos in ('PROPN', 'NOUN'):
            for head in heads:
                if head.upos == upos:
                    return lemma_or_form(head)
        return lemma_or_form(heads[0])

    def process_ud_doc(self, ud_doc, doc_num):
        """Print all trees of one document wrapped in `<div class="doc" id="doc<N>">`."""
        print(f'<div class="doc" id="doc{doc_num}">')
        for tree in ud_doc:
            self.process_tree(tree)
        print('</div>')

    def process_document(self, doc):
        """Print the whole HTML page and write the lazily loaded files."""
        ud_docs, doc_num, sent_id2doc = [], 0, {}
        for tree in doc.trees:
            if tree.newdoc or not ud_docs:
                ud_docs.append([])
                doc_num += 1
            ud_docs[-1].append(tree)
            sent_id2doc[tree.sent_id] = doc_num  # TODO: use sent_id2doc

        self._print_html_head()
        print('</head>\n<body>\n<div id="wrap">')

        # Assign a colour class to each entity (cycling within its etype)
        # and a unique DOM class to each mention.
        self._mention_ids = {}
        self._entity_colors = {}
        entities_of_type = Counter()
        for entity in doc.coref_entities:
            if self.colors:
                count = entities_of_type[entity.etype]
                entities_of_type[entity.etype] = count + 1
                self._entity_colors[entity] = f'c{count % self.colors}'
            for idx, mention in enumerate(entity.mentions, 1):
                self._mention_ids[mention] = f'{_dom_esc(entity.eid)}e{idx}'

        self._print_overview(doc)

        print('<div id="main">')
        self._print_menu()

        # The first ud_doc will be printed to the main html file.
        self.process_ud_doc(ud_docs[0], 1)
        print('</div>')  # id=main

        # Other ud_docs will be printed into separate files (so they can be loaded lazily)
        self._write_lazy_files(ud_docs, 'html', self.process_ud_doc)

        # docs_dir ends up inside a JavaScript string literal, so escape it.
        print(f'<script>\nvar all_docs = {len(ud_docs)};\nvar docs_dir = "{_esc(self.js_docs_dir)}";')
        print(SCRIPT_BASE)
        if self.show_trees:
            print('docs_json = [false, ', end='')  # 1-based index, so dummy docs_json[0]
            WRITE_HTML.print_doc_json(ud_docs[0])
            print('];')
            self._write_lazy_files(
                ud_docs, 'json', lambda ud_doc, _doc_num: WRITE_HTML.print_doc_json(ud_doc))
            print(SCRIPT_SHOWTREE)
        print('$("#doc1 .sentence").each(add_show_tree_button);')
        print('</script>')
        print('</div></body></html>')

    def _print_html_head(self):
        """Print the doctype, title, script tags and the complete <style> element."""
        print('<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">')
        print('<title>Udapi CorefUD viewer</title>')
        if self.path_to_js == 'web':
            print('<script src="https://code.jquery.com/jquery-3.6.3.min.js"></script>')
            print('<script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>')
            if self.show_trees:
                print('<script src="https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js"></script>')
        else:
            print(f'<script src="{self.path_to_js}/jquery-3.6.3.min.js"></script>')
            print(f'<script src="{self.path_to_js}/pako.min.js"></script>')
            if self.show_trees:
                print(f'<script src="{self.path_to_js}/js-treex-view.js"></script>')

        print('<style>' + CSS)
        for i, etype in enumerate(ETYPES):
            print(f'.{etype} {{background: hsl({int(i * 360/len(ETYPES))}, 80%, 85%);}}')
        print('.other {background: hsl(0, 0%, 85%);}')
        for i, htype in enumerate(HTYPES[:-1]):
            print(f'.htype.{htype} {{background: hsl({int(i * 360/len(HTYPES))}, 80%, 85%);}}')
        print('.htype.OTHER {background: hsl(0, 0%, 85%);}')
        if self.colors:
            for i in range(self.colors):
                print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}')
        if not self.show_eid:
            print('.eid {display: none;}')
        if not self.show_etype:
            print('.etype {display: none;}')
        print('</style>')

    def _print_overview(self, doc):
        """Print the side panel: a table with one row (eid, #mentions, word) per entity."""
        print('<div id="overview">')
        print('<table><thead><tr><th title="entity id">eid</th>'
              '<th title="number of mentions">#m</th>'
              '<th title="a word best representing the entity">word</th></tr></thead>\n<tbody>')
        for entity in doc.coref_entities:
            print(f'<tr><td><a href="#{_dom_esc(entity.eid)}">{self._html(entity.eid)}</a></td>'
                  f'<td>{len(entity.mentions)}</td>'
                  f'<td>{self._html(self._representative_word(entity))}</td></tr>')
        print('</tbody></table>')
        print('</div>')

    def _print_menu(self):
        """Print the visualization-options menu and its hamburger button."""
        print('<div id="main-menu">Show<br><div>\n'
              f' <input id="show-eid" type="checkbox" {"checked" if self.show_eid else ""} onclick="$(\'.eid\').toggle(this.checked);"><label for="show-eid">eid</label><br>\n'
              f' <input id="show-etype" type="checkbox" {"checked" if self.show_etype else ""} onclick="$(\'.etype\').toggle(this.checked);"><label for="show-etype">etype</label><br>\n'
              ' <input id="show-sent_id" type="checkbox" onclick="$(\'.sent_id\').toggle(this.checked);"><label for="show-sent_id">sent_id</label><br>\n'
              + (' <input id="show-trees" type="checkbox" checked onclick="$(\'.showtree\').toggle(this.checked);"><label for="show-trees">trees</label><br>\n' if self.show_trees else '') +
              ' <input id="show-color" type="checkbox" checked onclick="$(\'.m\').toggleClass(\'nocolor\',!this.checked);"><label for="show-color">colors</label><br>\n'
              ' <input id="show-boxes" type="checkbox" checked onclick="$(\'.m\').toggleClass(\'nobox\',!this.checked);"><label for="show-boxes">boxes</label></div><div>\n'
              ' <input id="show-norm" type="checkbox" checked onclick="$(\'.norm\').toggle(this.checked);"><label for="show-norm">non-mentions</label><br>\n'
              ' <input id="show-heads" type="checkbox" checked onclick="$(\'.head\').toggleClass(\'nobold\',!this.checked);"><label for="show-heads">heads in bold</label><br>\n'
              ' <input id="show-empty" type="checkbox" checked onclick="$(\'.empty\').toggle(this.checked);"><label for="show-empty">empty words</label><br>\n'
              ' <input id="show-breaks" type="checkbox" checked onclick="$(\'.sentence\').toggleClass(\'display-inline\',!this.checked);"><label for="show-breaks">sentence per line</label><br>\n'
              ' <input id="show-pars" type="checkbox" checked onclick="$(\'.par\').toggle(this.checked);"><label for="show-pars">paragraphs</label><br>\n'
              ' <input id="show-docs" type="checkbox" checked onclick="$(\'h1\').toggle(this.checked);"><label for="show-docs">document names</label><br>\n'
              '</div><fieldset onclick="$(\'.m\').toggleClass(\'htype\',$(\'#htype\')[0].checked)"><legend>bg color:</legend>\n'
              '<label><input type="radio" name="bgcolor" id="etype" checked>entity type</label>\n'
              '<label><input type="radio" name="bgcolor" id="htype">head upos</label>\n'
              '</fieldset>\n'
              '</div>\n'
              '<button id="menubtn" title="Visualization options" onclick="menuclick(this)"><div class="b1"></div><div class="b2"></div><div class="b3"></div></button>\n'
              )

    def _write_lazy_files(self, ud_docs, suffix, write_doc):
        """Write every document except the first one into `doc<N>.<suffix>.gz`.

        `write_doc(ud_doc, doc_num)` must print the content to sys.stdout,
        which is temporarily redirected to the gzip file. The file is always
        closed and sys.stdout restored, even if `write_doc` raises.
        The files are UTF-8 encoded, which is what the page declares and what
        the JavaScript side (pako.inflate with to:"string") decodes.
        """
        for doc_num, ud_doc in enumerate(ud_docs[1:], 2):
            filename = os.path.join(self.docs_dir, f'doc{doc_num}.{suffix}.gz')
            with gzip.open(filename, 'wt', encoding='utf-8') as gz_file, \
                    contextlib.redirect_stdout(gz_file):
                write_doc(ud_doc, doc_num)

    def _start_subspan(self, subspan, crossing=False):
        """Print the opening tag(s) and the label of one mention subspan.

        The order of the first two CSS classes matters: the JavaScript uses
        class[0] (entity) for selecting and class[1] (mention) for hovering.
        In RTL mode two nested spans are opened, so two must be closed later.
        """
        mention = subspan.mention
        entity = mention.entity
        head = mention.head

        classes = [_dom_esc(entity.eid), self._mention_ids[mention], entity.etype or 'other', 'm',
                   head.upos if head.upos in HTYPES else 'OTHER']
        title = [f'eid={subspan.subspan_eid}', f'etype={entity.etype}',
                 f'head={head.form}', f'head-upos={head.upos}']
        if self.colors:
            classes.append(self._entity_colors[entity])
        if all(word.is_empty() for word in subspan.words):
            classes.append('empty')
        if len(entity.mentions) == 1:
            classes.append('singleton')
        if crossing:
            classes.append('crossing')
            title.append('crossing')
        if mention.other:
            title.append(f'{mention.other}')

        # Only the first subspan of the first mention carries the entity's DOM id
        # (the target of the links in the overview table).
        span_id = ''
        if ((subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/'))
                and entity.mentions[0] == mention):
            span_id = f'id="{_dom_esc(entity.eid)}" '

        class_attr = self._html(' '.join(classes))
        title_attr = self._html('\n'.join(title))
        labels = (f'<span class="labels"><b class="eid">{_dom_esc(subspan.subspan_eid)}</b>'
                  f' <i class="etype">{self._html(entity.etype)}</i></span>')

        # The title should be always rendered left-to-right (e.g. "head=X", not "X=head"),
        # so for RTL languages, we need to use explicit dir="ltr" and insert a nested span with dir="rtl".
        if self.rtl:
            print(f'<span {span_id}class="{class_attr}" title="{title_attr}" dir="ltr">'
                  f'{labels}<span dir="rtl">', end='')
        else:
            print(f'<span {span_id}class="{class_attr}" title="{title_attr}">{labels}', end='')

    def process_tree(self, tree):
        """Print one sentence as a <p> with nested <span>s for its mentions."""
        nodes_and_empty = tree.descendants_and_empty
        mentions = {m for node in nodes_and_empty for m in node.coref_mentions}

        # Sorted in reverse, so the subspan to be opened next is always the last item.
        subspans = []
        for mention in mentions:
            subspans.extend(mention._subspans())
        subspans.sort(reverse=True)

        if tree.newdoc:
            docname = self._html(tree.newdoc) if tree.newdoc is not True else ""
            print(f'<hr><h1>{docname}</h1><hr>')
        elif tree.newpar:
            print('<hr class="par">')

        # Each opened subspan consists of two nested <span>s in RTL mode.
        closing = '</span></span>' if self.rtl else '</span>'
        opened, prev_node_mention = [], True
        rtl = ' dir="rtl"' if self.rtl else ""
        print(f'<p class="sentence" data-id="{self._html(tree.sent_id)}" id="{_id(tree)}"{rtl}>')
        for node in nodes_and_empty:
            # Close the <span class="norm"> of non-mention text before a mention starts.
            if not prev_node_mention and subspans and subspans[-1].words[0] == node:
                print('</span>', end='')
            while subspans and subspans[-1].words[0] == node:
                subspan = subspans.pop()
                self._start_subspan(subspan)
                opened.append(subspan)

            if not opened and prev_node_mention:
                print('<span class="norm">', end='')
            prev_node_mention = bool(opened)

            is_head = self._is_head(node)
            if is_head:
                print('<b class="head">', end='')
            if node.is_empty():
                print('<i class="empty">', end='')
            print(self._html(node.form), end='')
            if node.is_empty():
                print('</i>', end='')
            if is_head:
                print('</b>', end='')

            while opened and opened[-1].words[-1] == node:
                print(closing, end='')
                opened.pop()

            # Two mentions are crossing iff their spans have non-zero intersection,
            # but neither is a subset of the other, e.g. (e1 ... (e2 ... e1) ... e2).
            # Let's visualize this (simplified) as
            # <span class=e1>...<span class=e2>...</span></span><span class="e2 crossing">...</span>
            # i.e. let's split mention e2 into two subspans which are next to each other.
            # Unfortunately, we cannot mark now both crossing mentions using html class "crossing"
            # (opening tags are already printed), so we'll mark only the second part of the second mention.
            endings = [x for x in opened if x.words[-1] == node]
            if endings:
                new_opened, brokens, found_crossing = [], [], False
                for subspan in opened:
                    if subspan.words[-1] == node:
                        found_crossing = True
                    elif found_crossing:
                        brokens.append(subspan)
                    else:
                        new_opened.append(subspan)
                opened = new_opened
                print(closing * (len(endings) + len(brokens)), end='')
                for broken in brokens:
                    self._start_subspan(broken, True)
                    # Re-register the re-opened span itself (not the stale loop
                    # variable `subspan`), so it is closed at its own last word.
                    opened.append(broken)

            if not node.no_space_after:
                print(' ', end='')

        if not prev_node_mention:
            print('</span>', end='')
        print('</p>')

    def _is_head(self, node):
        """Return the first mention whose head is `node`, or None if there is none."""
        for mention in node.coref_mentions:
            if mention.head == node:
                return mention
        return None
# id needs to be a valid DOM querySelector # so it cannot contain [#./:] and maybe more, # so let's substitute all [^\w\d-] to be on the safe side. # DOM IDs cannot start with a digit, so prepend e.g. "n" if needed. def _dom_esc(string): if string[0].isdecimal(): string = 'n' + string return re.sub(r'[^\w\d-]', '_', string) def _id(node): if node is None: return 'null' return _dom_esc(node.address()) def _esc(string): if string is None: string = '' return string.replace('\\', '\\\\').replace('"', r'\"')