Source code for udapi.block.write.oldcorefud

"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation."""
import re
import logging
import udapi.block.write.conllu

class OldCorefUD(udapi.block.write.conllu.Conllu):
    """CoNLL-U writer that stores coreference in the old CorefUD 0.1 MISC attributes.

    Before delegating to the parent CoNLL-U writer, the coreference entities
    of the document are re-serialized into the attributes ``ClusterId``,
    ``MentionSpan``, ``ClusterType``, ``Bridging``, ``SplitAnte`` and
    ``MentionMisc`` on the head node of each mention.
    """

    def process_document(self, doc):
        """Rewrite coreference annotation of `doc` in the old style and write it.

        Steps:
          1. Warn if the document has no coreference entities.
          2. Remove every new-style and old-style coreference attribute from
             MISC of all nodes (including empty nodes) and drop the
             ``global.Entity`` document-level metadata, if present.
          3. Re-sort mentions and entities and store them into MISC of the
             mention heads. When several mentions share the same head, the
             attribute names get an index suffix (``[1]``, ``[2]``, ...).
          4. Call the parent class' ``process_document`` to do the writing.

        Args:
            doc: the document to be written; it must provide
                ``coref_entities``, ``nodes_and_empty`` and ``meta``.
        """
        if not doc.coref_entities:
            logging.warning("Using write.OldCorefUD on a document without any coreference annotation")

        # Delete both new-style (GUM-style) and old-style (CorefUD 0.1)
        # coreference annotations from MISC.
        attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split()
        # One precompiled alternation instead of building and matching
        # a separate pattern for each attribute and each MISC key.
        # It matches an attribute name optionally followed by "[<digits>]".
        coref_key_re = re.compile(r'(?:' + '|'.join(attrs) + r')(\[\d+\])?$')
        for node in doc.nodes_and_empty:
            # Iterate over a copy of the keys because we delete while iterating.
            for key in list(node.misc):
                if coref_key_re.match(key):
                    del node.misc[key]
        # The metadata may be missing (e.g. in a document without any
        # coreference annotation), so a plain `del` could raise KeyError.
        doc.meta.pop('global.Entity', None)

        # doc._eid_to_entity is a dict, which is insertion ordered in Python 3.7+.
        # The insertion order is sorted according to CorefEntity.__lt__.
        # However, new entities could be added meanwhile or some entities edited,
        # so we need to sort the entities again before storing to MISC.
        # We also need to make sure entity.mentions are sorted in each entity
        # because the ordering of entities is defined by the first mention in each entity.
        # Ordering of mentions within an entity can be changed when e.g. changing the span
        # of a given mention or reordering words within a sentence and in such events
        # Udapi currently does not automatically update the ordering of entities.
        for entity in doc.coref_entities:
            entity._mentions.sort()

        for entity in sorted(doc.coref_entities):
            for mention in entity.mentions:
                head = mention.head
                # NOTE: this relies on head.misc[...] giving a falsy value
                # (rather than raising) for keys which are not present.
                if head.misc["ClusterId"]:
                    # The head already holds exactly one un-indexed mention:
                    # move it to index [1] and store the current one as [2].
                    for attr in attrs:
                        if head.misc[attr]:
                            head.misc[attr + "[1]"] = head.misc[attr]
                            del head.misc[attr]
                    index_str = "[2]"
                else:
                    # Find the first unused index. If even [1] is unused,
                    # this is the first mention on this head: use no index.
                    index, index_str = 1, "[1]"
                    while head.misc["ClusterId" + index_str]:
                        index += 1
                        index_str = f"[{index}]"
                    if index == 1:
                        index_str = ""

                head.misc["ClusterId" + index_str] = entity.eid
                head.misc["MentionSpan" + index_str] = mention.span
                head.misc["ClusterType" + index_str] = entity.etype
                if mention._bridging:
                    head.misc["Bridging" + index_str] = ','.join(
                        f'{link.target.eid}:{link.relation}' for link in sorted(mention.bridging))
                if entity.split_ante:
                    serialized = ','.join(ante.eid for ante in sorted(entity.split_ante))
                    head.misc["SplitAnte" + index_str] = serialized
                if mention.other:
                    # '%2D' is turned back into a literal hyphen; presumably
                    # hyphens are escaped only in the new-style serialization.
                    head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-')

        super().process_document(doc)