Source code for udapi.block.write.oldcorefud

"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation."""
import re
import logging
import udapi.block.write.conllu


[docs]
class OldCorefUD(udapi.block.write.conllu.Conllu):


[docs]
    def process_document(self, doc):
        """Re-encode coreference in the old CorefUD 0.1 MISC attributes and write the document.

        All coreference-related MISC attributes (new-style and old-style) are
        first removed from every node, and the ``global.Entity`` entry is
        dropped from ``doc.meta`` if present. Then, for each mention of each
        entity in ``doc.coref_entities``, the attributes ``ClusterId``,
        ``MentionSpan``, ``ClusterType`` and, when applicable, ``Bridging``,
        ``SplitAnte`` and ``MentionMisc`` are stored in the MISC of the
        mention head. If several mentions share one head, the attribute names
        get the suffixes ``[1]``, ``[2]``, ... Finally the document is passed
        to the parent class's ``process_document``.

        Args:
            doc: the document to be converted and written; it is modified in place.
        """
        if not doc.coref_entities:
            logging.warning("Using write.OldCorefUD on a document without any coreference annotation")

        # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC.
        attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split()
        # One compiled pattern for all attribute names, optionally followed by an index like "[2]".
        coref_key = re.compile('(?:' + '|'.join(attrs) + r')(\[\d+\])?$')
        for node in doc.nodes_and_empty:
            # Iterate over a copy of the keys because entries are deleted inside the loop.
            for key in list(node.misc):
                if coref_key.match(key):
                    del node.misc[key]
        # The header may be missing (e.g. for a document without any coreference
        # annotation, which was only warned about above), so a plain `del`
        # would raise KeyError here.
        doc.meta.pop('global.Entity', None)

        # New entities could have been added or existing entities edited since
        # the coreference was loaded, so we need to sort the entities again
        # before storing them to MISC.
        # We also need to make sure entity.mentions are sorted in each entity
        # because the ordering of entities is defined by the first mention in each entity.
        # Ordering of mentions within an entity can be changed when e.g. changing the span
        # of a given mention or reordering words within a sentence and in such events
        # Udapi currently does not automatically update the ordering of entities.
        for entity in doc.coref_entities:
            entity._mentions.sort()
        for entity in sorted(doc.coref_entities):
            for mention in entity.mentions:
                head = mention.head
                # NOTE: the lookups below rely on head.misc returning a falsy value
                # for a key that is not present.
                if head.misc["ClusterId"]:
                    # The head already carries exactly one un-indexed mention:
                    # move its attributes to index [1] and store the current one as [2].
                    for attr in attrs:
                        if head.misc[attr]:
                            head.misc[attr + "[1]"] = head.misc[attr]
                            del head.misc[attr]
                    index_str = "[2]"
                else:
                    # Find the first unused index; a head with no mention yet
                    # gets the plain attribute names without any index.
                    index, index_str = 1, "[1]"
                    while head.misc["ClusterId" + index_str]:
                        index += 1
                        index_str = f"[{index}]"
                    if index == 1:
                        index_str = ""
                head.misc["ClusterId" + index_str] = entity.eid
                head.misc["MentionSpan" + index_str] = mention.span
                head.misc["ClusterType" + index_str] = entity.etype
                # The private attribute is tested, but the public one is serialized.
                if mention._bridging:
                    head.misc["Bridging" + index_str] = ','.join(
                        f'{link.target.eid}:{link.relation}' for link in sorted(mention.bridging))
                if entity.split_ante:
                    serialized = ','.join(c.eid for c in sorted(entity.split_ante))
                    head.misc["SplitAnte" + index_str] = serialized
                if mention.other:
                    # Turn any '%2D' in the serialized value back into '-'.
                    head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-')

        super().process_document(doc)