Source code for udapi.block.util.markdiff

"""util.MarkDiff is a special block for marking differences between parallel trees."""
import collections
import difflib
import pprint
from udapi.core.block import Block


class MarkDiff(Block):
    """Mark differences between parallel trees."""

    def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc',
                 mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False,
                 align=False, align_attr='Align', **kwargs):
        """Create the Mark block object.

        Params:
        gold_zone: Which of the zones should be treated as gold?
            (The changes are interpreted as from a "pred"=predicted zone into the gold zone.)
        attributes: Which node attributes should be considered when searching for diffs?
            The tree topology, i.e. node parent is always considered
            (unless `ignore_parent` is set).
        mark: What value should be used in `node.misc['Mark']` of the differing nodes?
        mark_attr: use this MISC attribute name instead of "Mark".
            Use mark_attr=0 to prevent marking diffs in MISC.
        add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running
            this block, so that the marked_only option (e.g. via `udapy -TM`) prints only
            nodes marked by this block.
        print_stats: How many lines of statistics should be printed? -1 means all.
        ignore_parent: ignore differences in dependency parents
        align: store word alignment, possible values are False (no alignment stored,
            the default),
            "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord,
            "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and
            "both", i.e. both from-pred and from-gold.
            If only forms should be considered for inducing the word alignment,
            you should use "util.MarkDiff attributes='form' ignore_parent=1 align=both".
            Only one-to-one alignment is supported: when a replaced span has a different
            length in pred and gold, only its first min(len_pred, len_gold) nodes
            are aligned.
        align_attr: use this MISC attribute name instead of "Align".

        Raises:
        ValueError: if mark_attr, align and print_stats are all false-like,
            because the block would then have no observable effect.
        """
        super().__init__(**kwargs)
        self.gold_zone = gold_zone
        self.attrs = attributes.split(',')
        self.mark = mark
        self.mark_attr = mark_attr
        self.add = add
        self.print_stats = print_stats
        self.ignore_parent = ignore_parent
        self.align = align
        self.align_attr = align_attr
        # Counts of edit descriptions accumulated over all processed trees.
        self.stats = collections.Counter()
        if not mark_attr and not align and not print_stats:
            raise ValueError('mark_attr=0 does not make sense without align or print_stats')

    def _parent_changed(self, alignment, p_node, g_node):
        """Tell whether the parents of two aligned nodes are not aligned to each other.

        `alignment` maps 0-based pred node indices to 0-based gold node indices.
        Always returns False when `ignore_parent` is set.
        """
        if self.ignore_parent:
            return False
        return alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1

    def process_tree(self, tree):
        """Diff `tree` against the gold-zone tree of the same bundle.

        Depending on the constructor parameters, differing nodes are marked in MISC,
        the word alignment is stored in MISC and edit statistics are collected
        in `self.stats`. The gold tree itself is skipped.
        """
        gold_tree = tree.bundle.get_tree(self.gold_zone)
        if tree == gold_tree:
            return
        if not self.add:
            for node in tree.descendants + gold_tree.descendants:
                del node.misc[self.mark_attr]
                del node.misc['ToDo']
                del node.misc['Bug']

        pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants
        # Make sure both pred and gold trees are marked, even if one has just deleted nodes.
        if len(pred_nodes) != len(gold_nodes) and self.mark_attr:
            tree.add_comment(f'{self.mark_attr} = {self.mark}')
            gold_tree.add_comment(f'{self.mark_attr} = {self.mark}')

        # Each node is represented by one string joining all the compared attributes.
        pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes]
        gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes]
        matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False)
        diffs = list(matcher.get_opcodes())

        # alignment: 0-based pred index -> 0-based gold index.
        # The entry -1 -> -1 aligns the parents of top-level nodes with each other
        # (presumably the technical root has ord 0, so `parent.ord - 1` is -1 -- verify).
        alignment = {-1: -1}
        for edit, pred_lo, pred_hi, gold_lo, gold_hi in diffs:
            if edit not in {'equal', 'replace'}:
                continue
            # A 'replace' span may have a different length in pred and gold.
            # Only the first n nodes are aligned (the same convention as in the statistics
            # below); the surplus nodes are treated as added/deleted. Iterating over the
            # whole pred span would index gold nodes outside of this span (or even outside
            # of the gold tree) and break the one-to-one alignment.
            n = min(pred_hi - pred_lo, gold_hi - gold_lo)
            for offset in range(n):
                pred_i, gold_i = pred_lo + offset, gold_lo + offset
                alignment[pred_i] = gold_i
                # The stored values are 1-based, i.e. list index + 1.
                if self.align in ("both", "from-pred"):
                    pred_nodes[pred_i].misc[self.align_attr] = gold_i + 1
                if self.align in ("both", "from-gold"):
                    gold_nodes[gold_i].misc[self.align_attr] = pred_i + 1

        for edit, pred_lo, pred_hi, gold_lo, gold_hi in diffs:
            if edit == 'equal':
                # All compared attributes are the same, so only the parent may differ.
                for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi],
                                          gold_nodes[gold_lo:gold_hi]):
                    if self._parent_changed(alignment, p_node, g_node):
                        self.stats['ONLY-PARENT-CHANGED'] += 1
                        if self.mark_attr:
                            p_node.misc[self.mark_attr] = self.mark
                            g_node.misc[self.mark_attr] = self.mark
                continue

            if self.mark_attr:
                for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]:
                    node.misc[self.mark_attr] = self.mark
            if not self.print_stats:
                continue

            if edit == 'replace':
                # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED
                n = min(pred_hi - pred_lo, gold_hi - gold_lo)
                for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n],
                                          gold_nodes[gold_lo:gold_lo + n]):
                    for attr in self.attrs:
                        p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr)
                        if p_value != g_value:
                            self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1
                    if self._parent_changed(alignment, p_node, g_node):
                        self.stats['PARENT-CHANGED'] += 1
                # Skip the aligned prefix, so only the surplus nodes are counted below.
                pred_lo, gold_lo = pred_lo + n, gold_lo + n
            for node in gold_nodes[gold_lo:gold_hi]:
                self.stats['ADD-WORD'] += 1
                self.stats['ADD-LEMMA: ' + node.lemma] += 1
            for node in pred_nodes[pred_lo:pred_hi]:
                self.stats['DELETE-WORD'] += 1
                self.stats['DELETE-LEMMA: ' + node.lemma] += 1

    def process_end(self):
        """Print the most frequent edits collected in `self.stats` (if `print_stats` is set)."""
        if self.print_stats:
            # -1 (possibly passed as a string) means no limit on the number of lines.
            how_many = None if self.print_stats in (-1, '-1') else self.print_stats
            for edit, count in self.stats.most_common(how_many):
                print(f'{count:4} {edit}')