Source code for udapi.block.corefud.link2cluster

import logging
from udapi.core.block import Block

class Link2Cluster(Block):
    """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.

    Every node may store its own ID and the ID of its antecedent in MISC
    (attributes `id_attr` and `ante_attr`). Each such antecedent link is
    turned into membership of both nodes in a common coreference entity;
    every mention created by this block consists of a single word.

    Params:
    id_attr: name of the MISC attribute holding the ID of the node.
    ante_attr: name of the MISC attribute holding the ID of the antecedent
     of the node (in the same ID space as `id_attr`).
    delete_orig_attrs: if true, the MISC attributes consumed by the conversion
     (`id_attr`, `ante_attr` and, for nodes that become mentions,
     `information-status`) are deleted from the nodes.
    """

    def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs):
        super().__init__(**kwargs)
        self.id_attr = id_attr
        self.ante_attr = ante_attr
        self.delete_orig_attrs = delete_orig_attrs

    def _new_mention(self, entity, node):
        """Create a single-word mention of `entity` headed by `node` and return it.

        A non-empty MISC `information-status` of the node is copied to
        `mention.other['infstat']` (and removed from MISC when
        `delete_orig_attrs` is set). Doing this in one place guarantees the
        same treatment for every mention, no matter in which order the links
        are processed.
        """
        mention = entity.create_mention(head=node, words=[node])
        infstat = node.misc['information-status']
        if infstat:
            mention.other['infstat'] = infstat
            if self.delete_orig_attrs:
                del node.misc['information-status']
        return mention

    def process_document(self, doc):
        """Convert all antecedent links found in `doc` into coreference entities.

        The nodes (including empty nodes) are scanned once to collect the
        ID -> node mapping and the list of links; then the links are resolved
        one by one. Self-references and links to unknown IDs are reported
        with a warning and otherwise ignored.
        """
        id2node = {}
        links = []
        for node in doc.nodes_and_empty:
            this_id = node.misc[self.id_attr]
            if this_id == '':
                # Nodes without an ID cannot take part in any link.
                continue
            id2node[this_id] = node
            ante_id = node.misc[self.ante_attr]
            if ante_id != '':
                if ante_id == this_id:
                    logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}")
                else:
                    links.append([ante_id, this_id])
            if self.delete_orig_attrs:
                for attr in (self.id_attr, self.ante_attr):
                    del node.misc[attr]

        # It seems faster&simpler to process the links in any order and implement entity merging,
        # rather than trying to sort the links so that no entity merging is needed.
        for ante_id, this_id in links:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}")
                continue

            ante_node, this_node = id2node[ante_id], id2node[this_id]
            ante_known = bool(ante_node.coref_mentions)
            this_known = bool(this_node.coref_mentions)

            if not this_known and not ante_known:
                # None of the nodes is part of any mention/entity. Let's create them.
                entity = this_node.root.document.create_coref_entity()
                self._new_mention(entity, ante_node)
                self._new_mention(entity, this_node)
            elif this_known and ante_known:
                # Both of the nodes are already part of mentions, maybe in different entities.
                # The entity of ante_node survives; the entity of this_node is emptied.
                merged = this_node.coref_entities[0]
                kept = ante_node.coref_entities[0]
                if merged == kept:
                    # Already the same entity (e.g. a cycle of antecedent links): nothing to do.
                    # Merging an entity into itself would duplicate its mentions
                    # and then clear the very same list, i.e. lose all of them.
                    continue
                # While the official API supports "stealing" a single mention (m.entity = another_entity),
                # the implementation below using _mentions and _entity is a bit faster.
                for mention in merged.mentions:
                    mention._entity = kept
                kept._mentions.extend(merged.mentions)
                kept._mentions.sort()
                merged._mentions.clear()
            elif ante_known:
                # Only the antecedent is part of an entity. Let's add this node to it.
                self._new_mention(ante_node.coref_entities[0], this_node)
            else:
                # Only this node is part of an entity. Let's add the antecedent to it.
                self._new_mention(this_node.coref_entities[0], ante_node)