# Source code for udapi.block.util.markdiff

"""util.MarkDiff is a special block for marking differences between parallel trees."""
import collections
import difflib
import pprint
from udapi.core.block import Block



# [docs]
class MarkDiff(Block):
    """Mark differences between parallel trees."""

    def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc',
                 mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False,
                 align=False, align_attr='Align', **kwargs):
        """Configure the diff-marking block.

        Params:
        gold_zone: name of the zone whose tree is treated as gold. Every other
            processed tree is the "pred" (predicted) side, and changes are
            reported in the direction pred -> gold.
        attributes: comma-separated names of the node attributes compared when
            looking for differences. Differences in the parent are checked
            in addition to these, unless `ignore_parent` is set.
        mark: the value stored in the MISC marker attribute of differing nodes.
        mark_attr: name of the MISC marker attribute (default "Mark").
            A falsy value (e.g. mark_attr=0) disables marking in MISC.
        add: if False, the MISC attributes `mark_attr`, ToDo and Bug are deleted
            from all nodes of both compared trees before the diff is computed,
            so that afterwards only the nodes marked by this block carry a marker.
        print_stats: how many lines of statistics to print at the end;
            -1 means all, a falsy value means none.
        ignore_parent: if true, differences in dependency parents are ignored.
        align: whether to store the word alignment in MISC. The values acted upon are
            "from-pred" (the pred node gets the position of its aligned gold node),
            "from-gold" (the gold node gets the position of its aligned pred node)
            and "both". With the default False nothing is stored.
            NOTE(review): any other truthy value (e.g. align=1) only satisfies
            the sanity check below; `process_tree` stores alignment for the
            three string values only.
            Only one-to-one alignment is supported.
        align_attr: name of the MISC attribute for the alignment (default "Align").
        **kwargs: passed on unchanged to the parent class constructor.

        Raises:
        ValueError: if `mark_attr`, `align` and `print_stats` are all falsy,
            i.e. the block would have no observable output.
        """
        super().__init__(**kwargs)

        # What is compared.
        self.gold_zone = gold_zone
        self.attrs = attributes.split(',')
        self.ignore_parent = ignore_parent

        # How the differences are reported.
        self.mark, self.mark_attr = mark, mark_attr
        self.add = add
        self.align, self.align_attr = align, align_attr
        self.print_stats = print_stats
        self.stats = collections.Counter()

        # At least one output channel (marks, alignment or statistics) must be on.
        if not any((mark_attr, align, print_stats)):
            raise ValueError('mark_attr=0 does not make sense without align or print_stats')


# [docs]
    def process_tree(self, tree):
        """Diff `tree` against the gold-zone tree of its bundle and mark the differences.

        Each node of both trees is serialized as its `self.attrs` values joined
        by "_", and the two token sequences are compared with
        `difflib.SequenceMatcher`. Based on the resulting opcodes:

        * nodes inside non-equal spans get `misc[self.mark_attr] = self.mark`;
        * nodes inside equal spans are marked only if their parents are not
          aligned to each other (skipped when `self.ignore_parent` is set);
        * if `self.align` is "from-pred", "from-gold" or "both", the 1-based
          position of the aligned counterpart is stored in `misc[self.align_attr]`
          (presumably equal to the counterpart's `ord` -- this assumes that
          `descendants` is ordered by consecutive ords starting at 1);
        * if `self.print_stats` is set, per-edit counts are added to `self.stats`.

        Alignment is strictly one-to-one: in a "replace" span whose two sides
        differ in length, only the first min(pred_len, gold_len) node pairs
        are aligned. The surplus nodes stay unaligned (they are the ones counted
        as ADD-WORD/DELETE-WORD), so a node whose parent is such a surplus node
        is reported as having a changed parent.

        Nothing is done when `tree` itself is the gold tree.
        """
        gold_tree = tree.bundle.get_tree(self.gold_zone)
        if tree == gold_tree:
            return

        pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants
        if not self.add:
            # Start from a clean slate, so that only this block's marks remain.
            for node in pred_nodes + gold_nodes:
                if self.mark_attr:  # a falsy mark_attr means "no marking", nothing to delete
                    del node.misc[self.mark_attr]
                del node.misc['ToDo']
                del node.misc['Bug']

        # Make sure both pred and gold trees are marked, even if one has just deleted nodes.
        if len(pred_nodes) != len(gold_nodes) and self.mark_attr:
            tree.add_comment(f'{self.mark_attr} = {self.mark}')
            gold_tree.add_comment(f'{self.mark_attr} = {self.mark}')
        pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes]
        gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes]
        matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False)
        diffs = list(matcher.get_opcodes())

        # alignment maps 0-based pred index -> 0-based gold index.
        # The entry -1 -> -1 aligns the two roots (root.ord - 1 == -1 when root.ord is 0).
        alignment = {-1: -1}
        store_in_pred = self.align in ("both", "from-pred")
        store_in_gold = self.align in ("both", "from-gold")
        for edit, pred_lo, pred_hi, gold_lo, gold_hi in diffs:
            if edit not in ('equal', 'replace'):
                continue
            # The two sides of a 'replace' span may differ in length. Pair up only
            # the common prefix, otherwise pred nodes would be aligned to gold nodes
            # outside of this span (or even past the end of gold_nodes).
            n_aligned = min(pred_hi - pred_lo, gold_hi - gold_lo)
            for offset in range(n_aligned):
                pred_i, gold_i = pred_lo + offset, gold_lo + offset
                alignment[pred_i] = gold_i
                if store_in_pred:
                    pred_nodes[pred_i].misc[self.align_attr] = gold_i + 1
                if store_in_gold:
                    gold_nodes[gold_i].misc[self.align_attr] = pred_i + 1

        for edit, pred_lo, pred_hi, gold_lo, gold_hi in diffs:
            if edit == 'equal':
                # Same attributes on both sides, so only the parent may differ.
                for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]):
                    if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1:
                        self.stats['ONLY-PARENT-CHANGED'] += 1
                        if self.mark_attr:
                            p_node.misc[self.mark_attr] = self.mark
                            g_node.misc[self.mark_attr] = self.mark
            else:
                if self.mark_attr:
                    for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]:
                        node.misc[self.mark_attr] = self.mark
                if self.print_stats:
                    if edit == 'replace':
                        # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED
                        n = min(pred_hi - pred_lo, gold_hi - gold_lo)
                        for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n], gold_nodes[gold_lo:gold_lo + n]):
                            for attr in self.attrs:
                                p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr)
                                if p_value != g_value:
                                    self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1
                            if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1:
                                self.stats['PARENT-CHANGED'] += 1
                        # Skip the aligned prefix; what remains is counted as added/deleted below.
                        pred_lo, gold_lo = pred_lo + n, gold_lo + n
                    for node in gold_nodes[gold_lo:gold_hi]:
                        self.stats['ADD-WORD'] += 1
                        self.stats['ADD-LEMMA: ' + node.lemma] += 1
                    for node in pred_nodes[pred_lo:pred_hi]:
                        self.stats['DELETE-WORD'] += 1
                        self.stats['DELETE-LEMMA: ' + node.lemma] += 1



# [docs]
    def process_end(self):
        if self.print_stats:
            how_many = None if self.print_stats in (-1, '-1') else self.print_stats
            for edit, count in self.stats.most_common(how_many):
                print(f'{count:4} {edit}')