# Source code for udapi.block.read.oldcorefud

"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation."""
import re
import logging
import udapi.block.read.conllu
from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks


# [docs]
class OldCorefUD(udapi.block.read.conllu.Conllu):
    """CoNLL-U reader converting the old CorefUD 0.1 coreference attributes.

    After the parent reader has loaded a document, the MISC attributes
    ``ClusterId``, ``MentionSpan``, ``ClusterType``, ``Bridging``, ``SplitAnte``
    and ``MentionMisc`` (optionally suffixed with ``[N]``) are converted into
    `CorefEntity`/`CorefMention` objects and then deleted from MISC.
    """

    # Matches any old-style MISC key, with or without the "[N]" index suffix.
    # Compiled once here instead of once per key and attribute name.
    _OLD_ATTR_RE = re.compile(
        r'(?:ClusterId|MentionSpan|ClusterType|Bridging|SplitAnte|MentionMisc)(?:\[\d+\])?$')

    def __init__(self, replace_hyphen_in_id_with='', **kwargs):
        """Create the read.OldCorefUD reader object.

        Args:
        replace_hyphen_in_id_with: string to use as a replacement for hyphens in ClusterId
          The new format does not allow hyphens in eid (IDs of entities),
          so we need to replace them.
        """
        super().__init__(**kwargs)
        self.replace_hyphen_in_id_with = replace_hyphen_in_id_with
        # Two-way mapping between the original (hyphenated) and the rewritten IDs.
        self.orig2new = {}
        self.new2orig = {}

    def _fix_id(self, cid):
        """Return `cid` with its hyphens replaced by `self.replace_hyphen_in_id_with`.

        Empty values and IDs without a hyphen are returned unchanged.
        The same original ID always maps to the same new ID; if the rewritten
        form is already taken by a different original ID, a numeric suffix
        (2, 3, ...) is appended until the new ID is unused.

        NOTE(review): IDs without hyphens are never registered in `new2orig`,
        so a rewritten ID may coincide with an untouched original one
        (e.g. "a-b" -> "ab" next to a genuine "ab") -- confirm whether
        this can happen in real data.
        """
        if not cid or '-' not in cid:
            return cid
        new_cid = self.orig2new.get(cid)
        if new_cid is None:
            new_cid = cid.replace('-', self.replace_hyphen_in_id_with)
            base, counter = new_cid, 1
            while new_cid in self.new2orig:
                counter += 1
                new_cid = f"{base}{counter}"
            self.new2orig[new_cid] = cid
            self.orig2new[cid] = new_cid
        return new_cid

    def process_document(self, doc, strict=True):
        """Load `doc` with the parent reader and convert its old-style coreference.

        Args:
        doc: the document to be filled; stores the result in `doc._eid_to_entity`.
        strict: if true, annotation errors raise ValueError,
          otherwise they are only logged and the conversion continues.

        Raises:
        ValueError: in strict mode, on a self-referencing or malformed Bridging
          link, a self-referencing SplitAnte, or an entity that is referenced
          but has no mention.
        """
        super().process_document(doc)

        eid_to_entity = {}
        for node in doc.nodes_and_empty:
            # The first mention is looked up under the plain key, falling back
            # to the "[1]" suffix; further mentions use increasing indices.
            index, index_str = 0, ""
            eid = node.misc["ClusterId"]
            if not eid:
                index, index_str = 1, "[1]"
                eid = node.misc["ClusterId[1]"]
            eid = self._fix_id(eid)
            while eid:
                entity = eid_to_entity.get(eid)
                if entity is None:
                    entity = CorefEntity(eid)
                    eid_to_entity[eid] = entity
                mention = CorefMention(words=[node], entity=entity)
                if node.misc["MentionSpan" + index_str]:
                    mention.span = node.misc["MentionSpan" + index_str]
                etype = node.misc["ClusterType" + index_str]
                if etype:
                    # On a conflict, warn and let the later value win.
                    if entity.etype is not None and etype != entity.etype:
                        logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}")
                    entity.etype = etype

                bridging_str = node.misc["Bridging" + index_str]
                if bridging_str:
                    mention._bridging = BridgingLinks(mention)
                    for link_str in bridging_str.split(','):
                        parts = link_str.split(':')
                        if len(parts) != 2:
                            # Strict mode raises ValueError (as the failed tuple
                            # unpacking used to); otherwise the link is skipped.
                            _error(f"Invalid Bridging link '{link_str}' in {node}", strict)
                            continue
                        target, relation = parts
                        target = self._fix_id(target)
                        if target == eid:
                            _error("Bridging cannot self-reference the same entity: " + target, strict)
                        # The target may be defined only later in the document.
                        if target not in eid_to_entity:
                            eid_to_entity[target] = CorefEntity(target)
                        mention._bridging.append((eid_to_entity[target], relation))

                split_ante_str = node.misc["SplitAnte" + index_str]
                if split_ante_str:
                    split_antes = []
                    # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma.
                    # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator.
                    for ante_str in split_ante_str.replace('+', ',').split(','):
                        ante_str = self._fix_id(ante_str)
                        if ante_str in eid_to_entity:
                            if ante_str == eid:
                                _error("SplitAnte cannot self-reference the same entity: " + eid, strict)
                            split_antes.append(eid_to_entity[ante_str])
                        else:
                            # split cataphora, e.g. "We, that is you and me..."
                            ante_cl = CorefEntity(ante_str)
                            eid_to_entity[ante_str] = ante_cl
                            split_antes.append(ante_cl)
                    entity.split_ante = sorted(split_antes)

                # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas.
                # We also need to escape forbidden characters.
                mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',')
                mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29')
                index += 1
                index_str = f"[{index}]"
                eid = self._fix_id(node.misc["ClusterId" + index_str])
        # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc.
        # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__),
        # not by the keys (eid).
        # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order.
        for entity in eid_to_entity.values():
            if not entity._mentions:
                _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict)
            entity._mentions.sort()
        doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())}

        # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted).
        for node in doc.nodes_and_empty:
            # Iterate over a copy of the keys, because entries are deleted inside the loop.
            for key in list(node.misc):
                if self._OLD_ATTR_RE.match(key):
                    del node.misc[key]




def _error(msg, strict):
    if strict:
        raise ValueError(msg)
    logging.error(msg)