Source code for udapi.block.corefud.movehead

import logging
from collections import Counter
from udapi.core.block import Block
from udapi.core.node import find_minimal_common_treelet

class MoveHead(Block):
    """Block corefud.MoveHead moves the head to the highest node in each mention."""

    def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs):
        """Create the block.

        Args:
            bugs: string consulted via substring tests (``'warn' in bugs``,
                ``'mark' in bugs``). With ``warn`` suspicious mentions are
                reported through ``logging.warning``; with ``mark`` a ``Bug``
                attribute is stored in the ``misc`` of the affected node.
            keep_head_if_possible: if true and no candidate clearly wins,
                the current mention head is kept when it is one of the
                candidates.
            **kwargs: forwarded unchanged to ``Block.__init__``.
        """
        # Statistics keyed by category name; reported in process_end().
        self.counter = Counter()
        self.bugs = bugs
        self.keep_head_if_possible = keep_head_if_possible
        super().__init__(**kwargs)

    def _eparents(self, node):
        """Return the list of parents of `node`.

        If the node has enhanced dependencies (``node._raw_deps != '_'``),
        the parents are taken from ``node.deps``. Otherwise the basic
        ``node.parent`` is returned as a one-element list, or an empty list
        when the node has no parent.
        """
        if node._raw_deps != '_':
            return [d['parent'] for d in node.deps]
        if node.parent:
            return [node.parent]
        return []

    def find_head(self, mention):
        """Choose the head word for `mention`.

        Returns:
            A pair ``(head, category)`` where `category` is one of
            ``'treelet'``, ``'cycle'``, ``'gappy'`` or ``'nontreelet'``.
            The category is used by the caller as a statistics key.
        """
        mwords = set(mention.words)

        # First, check the simplest case: no empty words and a treelet in basic dependencies.
        basic_heads = [w for w in mention.words if not w.parent or w.parent not in mwords]
        assert basic_heads
        if len(basic_heads) == 1:
            return basic_heads[0], 'treelet'

        # Second, check also enhanced dependencies (but only within basic_heads for simplicity).
        enh_heads = [w for w in basic_heads
                     if not any(p in mwords for p in self._eparents(w))]
        if not enh_heads:
            # Relax the condition: at least one enhanced parent outside of the mention.
            enh_heads = [w for w in basic_heads
                         if not all(p in mwords for p in self._eparents(w))]
        if not enh_heads:
            return mention.head, 'cycle'
        if len(enh_heads) == 1:
            return enh_heads[0], 'treelet'

        # Third, find non-empty parents (ancestors in future) of empty nodes.
        empty_nodes, non_empty = [], []
        for w in enh_heads:
            (empty_nodes if w.is_empty() else non_empty).append(w)
        if empty_nodes:
            for empty_node in empty_nodes:
                parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()]
                if parents:
                    if parents[0] not in non_empty:
                        non_empty.append(parents[0])
                else:
                    # TODO we should climb up, but preventing cycles
                    # We could also introduce empty_node.nonempty_ancestor
                    if 'warn' in self.bugs:
                        logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}")
                    if 'mark' in self.bugs:
                        # Mark the empty node itself (the original referenced
                        # an undefined name `node` here, raising NameError).
                        empty_node.misc['Bug'] = 'no-parent-of-empty'
            non_empty.sort()

        # Fourth, check if there is a node within the enh_heads governing all the mention nodes
        # and forming thus a "gappy treelet", where the head is clearly the "highest" node.
        # With no non-empty candidate there is nothing to search for,
        # so the helper is not called and the fallbacks below decide.
        if non_empty:
            highest, _ = find_minimal_common_treelet(*non_empty)
            if highest in enh_heads:
                return highest, 'gappy'
            if highest in mwords:
                if 'warn' in self.bugs:
                    logging.warning(f"Strange mention {mention.head} with highest node {highest}")
                if 'mark' in self.bugs:
                    highest.misc['Bug'] = 'highest-in-mwords'
                    mention.head.misc['Bug'] = 'highest-head'

        # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
        if self.keep_head_if_possible and mention.head in enh_heads:
            return mention.head, 'nontreelet'

        # Finally, return the word-order-wise first head candidate as the head.
        return enh_heads[0], 'nontreelet'

    def process_coref_mention(self, mention):
        """Update the head of `mention` (if needed) and record statistics.

        Mentions with fewer than two words are only counted as
        ``'single-word'``; their head is left untouched.
        """
        self.counter['total'] += 1
        if len(mention.words) < 2:
            self.counter['single-word'] += 1
            return
        new_head, category = self.find_head(mention)
        self.counter[category] += 1
        if new_head is mention.head:
            self.counter[category + '-kept'] += 1
        else:
            self.counter[category + '-moved'] += 1
            mention.head = new_head

    def process_end(self):
        """Log the collected per-category counts and their share of all mentions."""
        logging.info("corefud.MoveHead overview of mentions:")
        total = self.counter['total']
        # If no mention was processed, the counter has no items and the loop
        # body (with its division by `total`) is never executed.
        for key, value in self.counter.most_common():
            logging.info(f"{key:>16} = {value:6} ({100*value/total:5.1f}%)")