Source code for udapi.block.corefud.gum2corefud

import re
import logging
from collections import defaultdict
from udapi.core.block import Block

class Gum2CorefUD(Block):
    """Convert GUM-style ``Entity``, ``Bridge`` and ``Split`` MISC attributes
    into coreference entities and mentions stored in the document.
    """

    # Splits the value of the Entity attribute into single bracketed items, e.g.
    # "(abstract-7-new-2-coref(abstract-3-giv:act-1-coref)" is split into
    # ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"]
    # (after dropping the empty strings produced by re.split).
    _ENTITY_SPLIT_RE = re.compile(r'(\([^()]+\)?|[^()]+\))')

    def process_tree(self, tree):
        """Convert the Entity/Bridge/Split MISC attributes of all nodes in `tree`.

        Nodes with an empty ``Entity`` attribute are skipped entirely
        (their ``Bridge`` and ``Split`` attributes, if any, are not processed).
        The processed attributes are deleted from ``node.misc``.
        Mentions opened but not closed within this tree are reported with
        a warning and removed at the end.

        Args:
            tree: the tree whose descendants are processed; entities are looked up
                and stored in ``tree.bundle.document._eid_to_entity`` under the key
                ``<docname>_<GRP>``.

        Raises:
            ValueError: if a mention is closed but was not opened in this tree,
                if an opening Entity item has fewer than 5 attributes,
                if a Bridge/Split item does not have the form ``X<Y``,
                or if a Split item refers to a different entity than the last one.
        """
        docname = tree.bundle.document.meta['docname'] + '_'
        eid_to_entity = tree.bundle.document._eid_to_entity
        # Entity name -> stack of mentions which were opened but not closed yet.
        unfinished_mentions = defaultdict(list)

        for node in tree.descendants:
            misc_entity = node.misc['Entity']
            if not misc_entity:
                continue

            # Attribute Entity may contain multiple entities, e.g.
            # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref)
            # means a start of entity id=7 and start&end
            # (i.e. single-word mention) of entity id=3.
            for entity_str in self._ENTITY_SPLIT_RE.split(misc_entity):
                if not entity_str:
                    continue
                # GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity
                # but the closing tag is shortened just to GRP.
                opening = entity_str[0] == '('
                closing = entity_str[-1] == ')'
                entity_str = entity_str.strip('()')

                if not opening and not closing:
                    logging.warning(f"Entity {entity_str} at {node} has no opening nor closing bracket.")
                    continue

                if not opening:
                    # Closing bracket only: finish the most recently opened mention of this entity.
                    name = docname + entity_str
                    if not unfinished_mentions[name]:
                        raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.")
                    mention = unfinished_mentions[name].pop()
                    mention.span = f'{mention.head.ord}-{node.ord}'
                    continue

                # Opening bracket (possibly with a closing one for single-word mentions).
                attrs = entity_str.split('-')
                if len(attrs) == 6:
                    etype, grp, infstat, minspan, ctype, wiki = attrs
                elif len(attrs) == 5:
                    wiki = None
                    etype, grp, infstat, minspan, ctype = attrs
                elif len(attrs) > 6:
                    # Extra hyphens are kept as a part of the last (wiki) attribute.
                    logging.warning(f"Entity {entity_str} at {node} has more than 6 attributes.")
                    etype, grp, infstat, minspan, ctype, wiki = entity_str.split('-', maxsplit=5)
                else:
                    raise ValueError(f"Less than 5 attributes in {entity_str} at {node}")

                name = docname + grp
                entity = eid_to_entity.get(name)
                if entity is None:
                    # First occurrence of this entity: create it and annotate its first mention.
                    entity = node.create_coref_entity(eid=name, etype=etype)
                    mention = entity.mentions[0]
                    mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}"
                    if wiki:
                        # NOTE: commas in `wiki` are not escaped (e.g. as %2C).
                        mention.misc += ',Wikification:' + wiki
                else:
                    mention = entity.create_mention(head=node)

                if closing:
                    mention.words = [node]
                else:
                    unfinished_mentions[name].append(mention)
            del node.misc['Entity']

            misc_bridges = node.misc['Bridge']
            if misc_bridges:
                # E.g. Entity=event-12|Bridge=12<124,12<125
                for misc_bridge in misc_bridges.split(','):
                    try:
                        trg_str, src_str = [docname + grp for grp in misc_bridge.split('<')]
                    except ValueError as err:
                        raise ValueError(f"{node}: {misc_bridge} {err}") from err
                    try:
                        trg_entity = eid_to_entity[trg_str]
                        src_entity = eid_to_entity[src_str]
                    except KeyError as err:
                        logging.warning(f"{node}: Cannot find entity {err}")
                    else:
                        mention = src_entity.mentions[-1]
                        # TODO: what relation should we choose for Bridging?
                        # relation = f"{src_str.split('-')[0]}-{trg_str.split('-')[0]}"
                        relation = '_'
                        mention.bridging.append((trg_entity, relation))
                del node.misc['Bridge']

            misc_split = node.misc['Split']
            if misc_split:
                # E.g. Entity=(person-54)|Split=4<54,9<54
                src_str = docname + misc_split.split('<')[-1]
                ante_entities = []
                for item in misc_split.split(','):
                    ante_str, this_str = [docname + grp for grp in item.split('<')]
                    if this_str != src_str:
                        # There are just three such cases in GUM and all are bugs.
                        # An alternative to raising would be logging a warning and
                        # ignoring the whole Split attribute (with `break`, the `else`
                        # clause below would be skipped).
                        raise ValueError(f'{node} invalid Split: {this_str} != {src_str}')
                    ante_entities.append(eid_to_entity[ante_str])
                else:
                    eid_to_entity[src_str].split_ante = ante_entities
                del node.misc['Split']

        # Mentions must not cross tree boundaries: delete those which were not closed.
        for entity_name, mentions in unfinished_mentions.items():
            for mention in mentions:
                logging.warning(f"Mention {entity_name} opened at {mention.head}, "
                                "but not closed in the same tree. Deleting.")
                entity = mention.entity
                mention.words = []
                entity._mentions.remove(mention)
                # An entity with no mentions left is removed from the document.
                if not entity._mentions:
                    del eid_to_entity[entity_name]