# Source code for udapi.block.corefud.link2cluster

import logging
from udapi.core.block import Block


# [docs]
class Link2Cluster(Block):
    """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.

    Params:
    id_attr: name of the attribute in MISC that stores the original-format IDs of nodes
    ante_attr: name of the attribute in MISC that stores the ID of the antecedent
        of the current node (in the same format as `id_attr`).
    delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
        (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
        bridge_attr, bridge_relation_attr if these are used). Default=True.
    infstat_attr: name of the attribute in MISC that stores the information status of a given mention
        Will be stored in `mention.other['infstat']`. Use None for ignoring this.
    coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention
        Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
    bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
        of the current node/mention (in the same format as `id_attr`).
        Default=None, i.e. ignore this parameter.
    bridge_relation_attr:  name of the attribute in MISC that stores the bridging relation type
        (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
    eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
        The main goal of this parameter is to make eid unique across multiple documents.
        If you use eid_counter=0, this feature will be turned off,
        so entities will be created using `root.document.create_coref_entity()`,
        with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
    """

    def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
                 infstat_attr='information-status', coreftype_attr='coreftype',
                 bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
        """Store the conversion parameters; see the class docstring for their meaning."""
        super().__init__(**kwargs)
        self.id_attr = id_attr
        self.ante_attr = ante_attr
        self.delete_orig_attrs = delete_orig_attrs
        self.infstat_attr = infstat_attr
        self.coreftype_attr = coreftype_attr
        self.bridge_attr = bridge_attr
        self.bridge_relation_attr = bridge_relation_attr
        # int() so that a value passed as a string (e.g. "0") is handled as a number.
        self.eid_counter = int(eid_counter)

    def _new_entity(self, doc):
        """Create and return a new coreference entity in `doc`.

        With a non-zero `eid_counter`, the entity gets the explicit eid
        "e<counter>" and the counter is incremented afterwards. With
        `eid_counter == 0`, the eid is left for the document to choose.
        """
        if not self.eid_counter:
            return doc.create_coref_entity()
        entity = doc.create_coref_entity(eid=f"e{self.eid_counter}")
        self.eid_counter += 1
        return entity

    def _new_mention(self, entity, node):
        """Create and return a single-word mention of `entity` headed by `node`.

        If `infstat_attr` / `coreftype_attr` are set and the node has a non-empty
        value for them in MISC, the value is copied to `mention.other['infstat']` /
        `mention.other['coreftype']` and (with `delete_orig_attrs`) removed from MISC.
        """
        mention = entity.create_mention(head=node, words=[node])
        if self.infstat_attr and node.misc[self.infstat_attr]:
            mention.other['infstat'] = node.misc[self.infstat_attr]
            if self.delete_orig_attrs:
                del node.misc[self.infstat_attr]
        if self.coreftype_attr and node.misc[self.coreftype_attr]:
            mention.other['coreftype'] = node.misc[self.coreftype_attr]
            if self.delete_orig_attrs:
                del node.misc[self.coreftype_attr]
        return mention

    def process_document(self, doc):
        """Convert the link-based annotation of all nodes in `doc` into entities and mentions.

        The conversion runs in three passes:
        1. collect the coreference links (and bridging links, if `bridge_attr` is set)
           from MISC of all nodes, including empty nodes,
        2. turn the coreference links into entities, merging entities when needed,
        3. attach the bridging links to the mentions.

        Self-references and links to unknown IDs are reported via `logging.warning`
        and skipped. The same holds for links between two nodes that are already
        in the same entity (e.g. a cycle of antecedents).
        """
        id2node = {}
        links = []
        bridges = []
        for node in doc.nodes_and_empty:
            this_id = node.misc[self.id_attr]
            if this_id == '':
                continue
            id2node[this_id] = node
            ante_id = node.misc[self.ante_attr]
            if ante_id != '':
                if ante_id == this_id:
                    logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}")
                else:
                    links.append((ante_id, this_id))
            if self.delete_orig_attrs:
                for attr in (self.id_attr, self.ante_attr):
                    del node.misc[attr]
            if self.bridge_attr:
                bridge_id = node.misc[self.bridge_attr]
                if bridge_id != '':
                    # NOTE(review): bridge_relation_attr is read (and deleted below)
                    # even when it is None -- presumably MISC tolerates that; confirm.
                    if bridge_id == this_id:
                        logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}")
                    else:
                        bridges.append((bridge_id, this_id, node.misc[self.bridge_relation_attr]))
                    if self.delete_orig_attrs:
                        for attr in (self.bridge_attr, self.bridge_relation_attr):
                            del node.misc[attr]

        # It seems faster&simpler to process the links in any order and implement entity merging,
        # rather than trying to sort the links so that no entity merging is needed.
        for ante_id, this_id in links:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}")
                continue
            ante_node, this_node = id2node[ante_id], id2node[this_id]
            if not this_node.coref_mentions and not ante_node.coref_mentions:
                # None of the nodes is part of any mention/entity. Let's create them.
                entity = self._new_entity(this_node.root.document)
                self._new_mention(entity, ante_node)
                self._new_mention(entity, this_node)
            elif this_node.coref_mentions and ante_node.coref_mentions:
                # Both of the nodes are already part of mentions.
                # Let's merge the two entities, i.e. "steal" all mentions from the entity
                # of this_node (e_from) into the entity of ante_node (e_into).
                # While the official API supports "stealing" a single mention (m.entity = another_entity),
                # the implementation below using _mentions and _entity is a bit faster.
                e_from, e_into = this_node.coref_entities[0], ante_node.coref_entities[0]
                if e_from is e_into:
                    # Happens e.g. when the antecedent links form a cycle. Merging an entity
                    # with itself would duplicate and then clear its mentions, so skip the link.
                    logging.warning(f"{this_node} and its antecedent {ante_node} are already in the same entity, "
                                    f"ignoring the link {self.ante_attr}={ante_id}")
                    continue
                for mention in e_from.mentions:
                    mention._entity = e_into
                e_into._mentions.extend(e_from.mentions)
                e_into._mentions.sort()
                e_from._mentions.clear()
            elif ante_node.coref_mentions:
                # Only the antecedent is part of an entity. Let's add this_node to it.
                self._new_mention(ante_node.coref_entities[0], this_node)
            else:
                # Only this_node is part of an entity. Let's add the antecedent to it.
                self._new_mention(this_node.coref_entities[0], ante_node)

        # Bridging
        for ante_id, this_id, relation in bridges:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}")
                continue
            ante_node, this_node = id2node[ante_id], id2node[this_id]
            # Both ends of a bridging link need a mention; create singletons if missing.
            if ante_node.coref_mentions:
                m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node)
                e_ante = m_ante.entity
            else:
                e_ante = self._new_entity(ante_node.root.document)
                m_ante = self._new_mention(e_ante, ante_node)
            if this_node.coref_mentions:
                m_this = next(m for m in this_node.coref_mentions if m.head is this_node)
            else:
                e_this = self._new_entity(this_node.root.document)
                m_this = self._new_mention(e_this, this_node)
            m_this.bridging.append((e_ante, relation))