import logging
from udapi.core.block import Block
class Link2Cluster(Block):
    """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.

    Every node whose MISC contains `id_attr` is registered under that ID.
    A node whose MISC also contains `ante_attr` is linked to its antecedent,
    and linked nodes end up as single-word mentions of the same entity.
    Optionally, bridging links stored in `bridge_attr` are converted as well.

    Params:
    id_attr: name of the attribute in MISC that stores the original-format IDs of nodes.
        Default='proiel-id'.
    ante_attr: name of the attribute in MISC that stores the ID of the antecedent
        of the current node (in the same format as `id_attr`). Default='antecedent-proiel-id'.
    delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
        (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
        bridge_attr, bridge_relation_attr if these are used). Default=True.
    infstat_attr: name of the attribute in MISC that stores the information status of a given mention.
        Will be stored in `mention.other['infstat']`. Use None for ignoring this.
        Default='information-status'.
    coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention.
        Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
        Default='coreftype'.
    bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
        of the current node/mention (in the same format as `id_attr`).
        Default=None, i.e. ignore this parameter.
    bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type
        (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
    eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
        The main goal of this parameter is to make eid unique across multiple documents.
        If you use eid_counter=0, this feature will be turned off,
        so entities will be created using `root.document.create_coref_entity()`,
        with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
    """

    def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
                 infstat_attr='information-status', coreftype_attr='coreftype',
                 bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
        super().__init__(**kwargs)
        self.id_attr = id_attr
        self.ante_attr = ante_attr
        self.delete_orig_attrs = delete_orig_attrs
        self.infstat_attr = infstat_attr
        self.coreftype_attr = coreftype_attr
        self.bridge_attr = bridge_attr
        self.bridge_relation_attr = bridge_relation_attr
        # int() so that a value given as a string (e.g. "5") works as well.
        self.eid_counter = int(eid_counter)

    def _new_entity(self, doc):
        """Create a new coreference entity in `doc` and return it.

        With a non-zero `eid_counter`, the entity gets the eid "e<counter>"
        and the counter is incremented; the counter lives on the block instance,
        so it keeps growing across all documents processed by this block.
        With `eid_counter == 0`, the eid is left to `doc.create_coref_entity()`.
        """
        if not self.eid_counter:
            return doc.create_coref_entity()
        entity = doc.create_coref_entity(eid=f"e{self.eid_counter}")
        self.eid_counter += 1
        return entity

    def _new_mention(self, entity, node):
        """Create a single-word mention of `entity` headed by `node` and return it.

        Non-empty values of the `infstat_attr` and `coreftype_attr` MISC attributes
        are copied to `mention.other['infstat']` and `mention.other['coreftype']`
        and, if `delete_orig_attrs` is set, removed from `node.misc` afterwards.
        """
        mention = entity.create_mention(head=node, words=[node])
        for attr, key in ((self.infstat_attr, 'infstat'), (self.coreftype_attr, 'coreftype')):
            if not attr:
                continue
            value = node.misc[attr]
            if value:
                mention.other[key] = value
                if self.delete_orig_attrs:
                    del node.misc[attr]
        return mention

    def _collect_links(self, doc):
        """Scan all nodes (including empty ones) of `doc` for IDs and links.

        Only nodes with a non-empty `id_attr` are considered.
        Self-references are reported with a warning and skipped.

        Returns a tuple `(id2node, links, bridges)`:
        id2node: dict mapping each original ID to its node
            (if an ID is repeated, the last node with that ID wins),
        links: list of `(ante_id, this_id)` coreference links,
        bridges: list of `(ante_id, this_id, relation)` bridging links
            (always empty when `bridge_attr` is not set).
        """
        id2node, links, bridges = {}, [], []
        for node in doc.nodes_and_empty:
            this_id = node.misc[self.id_attr]
            if this_id == '':
                continue
            id2node[this_id] = node

            ante_id = node.misc[self.ante_attr]
            if ante_id != '':
                if ante_id == this_id:
                    logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}")
                else:
                    links.append((ante_id, this_id))
            if self.delete_orig_attrs:
                for attr in (self.id_attr, self.ante_attr):
                    del node.misc[attr]

            if not self.bridge_attr:
                continue
            bridge_id = node.misc[self.bridge_attr]
            if bridge_id != '':
                if bridge_id == this_id:
                    logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}")
                else:
                    # bridge_relation_attr is optional (None means "ignore"), so do not
                    # look up a None key; an empty relation is used instead.
                    # NOTE(review): assumes a missing MISC attribute reads as '' (as the
                    # `!= ''` checks above suggest), which makes this equivalent to the lookup.
                    relation = node.misc[self.bridge_relation_attr] if self.bridge_relation_attr else ''
                    bridges.append((bridge_id, this_id, relation))
            if self.delete_orig_attrs:
                for attr in (self.bridge_attr, self.bridge_relation_attr):
                    if attr:
                        del node.misc[attr]
        return id2node, links, bridges

    def _add_coref_link(self, ante_node, this_node):
        """Make sure `ante_node` and `this_node` are mentions of the same entity."""
        ante_in_entity = bool(ante_node.coref_mentions)
        this_in_entity = bool(this_node.coref_mentions)

        if not ante_in_entity and not this_in_entity:
            # Neither of the nodes is part of any mention/entity. Let's create them.
            entity = self._new_entity(this_node.root.document)
            self._new_mention(entity, ante_node)
            self._new_mention(entity, this_node)
            return

        if ante_in_entity and this_in_entity:
            kept = ante_node.coref_entities[0]
            absorbed = this_node.coref_entities[0]
            if absorbed == kept:
                # Happens e.g. with a cycle of antecedent links or with a repeated ID.
                # Merging an entity with itself would extend its mention list by itself
                # and then clear it (losing every mention), so the link is skipped.
                logging.warning(f"{this_node} and {ante_node} are already in the same entity, "
                                f"ignoring the link {self.ante_attr} (cycle or repeated {self.id_attr}?)")
                return
            # The nodes are part of mentions in two different entities.
            # Let's merge them: all mentions of this_node's entity are moved into
            # ante_node's entity and this_node's entity is left without mentions.
            # While the official API supports "stealing" a single mention (m.entity = another_entity),
            # the implementation below using _mentions and _entity is a bit faster.
            for mention in absorbed.mentions:
                mention._entity = kept
            kept._mentions.extend(absorbed.mentions)
            kept._mentions.sort()
            absorbed._mentions.clear()
            return

        # Only one of the nodes is part of an entity. Let's add the second one to this entity.
        if ante_in_entity:
            self._new_mention(ante_node.coref_entities[0], this_node)
        else:
            self._new_mention(this_node.coref_entities[0], ante_node)

    def _add_bridging_link(self, ante_node, this_node, relation):
        """Add a bridging link with `relation` from the mention of `this_node` to the entity of `ante_node`.

        A node which is not part of any mention yet gets a new single-word mention in a new entity.
        Mentions created by this block are always headed by their only word,
        so the head lookup below succeeds for them; `next()` raises StopIteration
        if a node is covered only by mentions headed by other nodes.
        """
        if ante_node.coref_mentions:
            e_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node).entity
        else:
            e_ante = self._new_entity(ante_node.root.document)
            self._new_mention(e_ante, ante_node)

        if this_node.coref_mentions:
            m_this = next(m for m in this_node.coref_mentions if m.head is this_node)
        else:
            e_this = self._new_entity(this_node.root.document)
            m_this = self._new_mention(e_this, this_node)
        m_this.bridging.append((e_ante, relation))

    def process_document(self, doc):
        """Convert all coreference (and optionally bridging) links in `doc` to entities and mentions.

        Links whose antecedent ID is not present in the document are reported with a warning and skipped.
        """
        id2node, links, bridges = self._collect_links(doc)

        # It seems faster&simpler to process the links in any order and implement entity merging,
        # rather than trying to sort the links so that no entity merging is needed.
        for ante_id, this_id in links:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}")
                continue
            self._add_coref_link(id2node[ante_id], id2node[this_id])

        # Bridging
        for ante_id, this_id, relation in bridges:
            if ante_id not in id2node:
                logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}")
                continue
            self._add_bridging_link(id2node[ante_id], id2node[this_id], relation)