import re
import logging
from collections import defaultdict
from udapi.core.block import Block
[docs]
class Gum2CorefUD(Block):
[docs]
def process_tree(self, tree):
docname = tree.bundle.document.meta['docname'] + '_'
eid_to_entity = tree.bundle.document._eid_to_entity
unfinished_mentions = defaultdict(list)
for node in tree.descendants:
misc_entity = node.misc['Entity']
if not misc_entity:
continue
# Attribute Entity may contain multiple entities, e.g.
# Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref)
# means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3.
# The following re.split line splits this into
# entities = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"]
entities = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x]
for entity in entities:
# GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity
# but the closing tag is shortent just to GRP.
opening, closing = (entity[0] == '(', entity[-1] == ')')
entity = entity.strip('()')
if not opening and not closing:
logging.warning(f"Entity {entity} at {node} has no opening nor closing bracket.")
elif not opening and closing:
name = docname + entity
if not unfinished_mentions[name]:
raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.")
else:
mention = unfinished_mentions[name].pop()
mention.span = f'{mention.head.ord}-{node.ord}'
else:
attrs = entity.split('-')
if len(attrs) == 6:
etype, grp, infstat, minspan, ctype, wiki = attrs
elif len(attrs) == 5:
wiki = None
etype, grp, infstat, minspan, ctype = attrs
elif len(attrs) > 6:
logging.warning(f"Entity {entity} at {node} has more than 6 attributes.")
etype, grp, infstat, minspan, ctype, wiki = entity.split('-', maxsplit=5)
else:
raise ValueError(f"Less than 5 attributes in {entity} at {node}")
name = docname + grp
entity = eid_to_entity.get(name)
if entity is None:
entity = node.create_coref_entity(eid=name, etype=etype)
mention = entity.mentions[0]
mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}"
if wiki:
mention.misc += ',Wikification:' + wiki #.replace(',', '%2C')
else:
mention = entity.create_mention(head=node)
if closing:
mention.words = [node]
else:
unfinished_mentions[name].append(mention)
del node.misc['Entity']
misc_bridges = node.misc['Bridge']
if misc_bridges:
# E.g. Entity=event-12|Bridge=12<124,12<125
for misc_bridge in misc_bridges.split(','):
try:
trg_str, src_str = [docname + grp for grp in misc_bridge.split('<')]
except ValueError as err:
raise ValueError(f"{node}: {misc_bridge} {err}")
try:
trg_entity = eid_to_entity[trg_str]
src_entity = eid_to_entity[src_str]
except KeyError as err:
logging.warning(f"{node}: Cannot find entity {err}")
else:
mention = src_entity.mentions[-1]
# TODO: what relation should we choose for Bridging?
# relation = f"{src_str.split('-')[0]}-{trg_str.split('-')[0]}"
relation = '_'
mention.bridging.append((trg_entity, relation))
del node.misc['Bridge']
misc_split = node.misc['Split']
if misc_split:
# E.g. Entity=(person-54)|Split=4<54,9<54
src_str = docname + misc_split.split('<')[-1]
ante_entities = []
for x in misc_split.split(','):
ante_str, this_str = [docname + grp for grp in x.split('<')]
if this_str != src_str:
raise ValueError(f'{node} invalid Split: {this_str} != {src_str}')
# logging.warning
# There are just three such cases in GUM and all are bugs,
# so let's ignore them entirely (the `else` clause will be skipped if exiting `for` w/ `break`).
# break
ante_entities.append(eid_to_entity[ante_str])
else:
eid_to_entity[src_str].split_ante = ante_entities
del node.misc['Split']
for entity_name, mentions in unfinished_mentions.items():
for mention in mentions:
logging.warning(f"Mention {name} opened at {mention.head}, but not closed in the same tree. Deleting.")
entity = mention.entity
mention.words = []
entity._mentions.remove(mention)
if not entity._mentions:
del eid_to_entity[name]