Source code for udapi.block.corefud.printmentions

import random
from collections import Counter
from udapi.core.block import Block
from udapi.block.write.textmodetreeshtml import TextModeTreesHtml
from udapi.block.write.textmodetrees import TextModeTrees


[docs]
class PrintMentions(Block):
    """Print mentions with various properties."""

    def __init__(self, continuous='include', almost_continuous='include', treelet='include',
                 forest='include', almost_forest='include', oneword='include', singleton='include',
                 empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
                 print_total=True, print_should=True,
                 print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
                 minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc',
                 print_undef_as='_', print_doc_meta=True, print_comments=False,
                 mark='(Mark)', hints=True, layout='classic',
                 **kwargs):
        super().__init__(**kwargs)
        self.continuous = self._convert(continuous)
        self.almost_continuous = self._convert(almost_continuous)
        self.treelet = self._convert(treelet)
        self.forest = self._convert(forest)
        self.almost_forest = self._convert(almost_forest)
        self.oneword = self._convert(oneword)
        self.singleton = self._convert(singleton)
        self.empty = self._convert(empty)

        self.max_trees = max_trees
        self.html = html
        self.shuffle = shuffle
        if shuffle:
            random.seed(42)
        self.print_other_forms = print_other_forms
        self.print_total = print_total,
        self.print_should = print_should,
        print_class = TextModeTreesHtml if html else TextModeTrees
        self.print_block = print_class(
                print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent,
                minimize_cross=minimize_cross, color=color, attributes=attributes,
                print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments,
                mark=mark, hints=hints, layout=layout)

    def _convert(self, value):
        if value in {'include', 'exclude', 'only'}:
            return value
        if value == 1:
            return 'only'
        if value == 0:
            return 'exclude'
        raise ValueError('unknown value ' + value)


[docs]
    def before_process_document(self, document):
        self.print_block.before_process_document(document)



[docs]
    def after_process_document(self, document):
        self.print_block.after_process_document(document)


    def _ok(self, condition, value):
        if value == 'include':
            return True
        return (condition and value == 'only') or (not condition and value=='exclude')

    def _is_auxiliary_etc(self, node):
        if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}:
            return True
        if node.deprel == 'advmod:emph':
            return True
        if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
            return True
        return False

    def _is_forest(self, mention, mwords, almost):
        for w in mention.words:
            # UD unfortunatelly does not use the copula-as-head style for copula construction,
            # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children.
            # However, in the cop-as-head stule, only "my" would depend on "fault" (and should be part of the mention).
            # It is difficult to tell apart which w.children are related to w and which to the copula.
            # We thus ignore these cases completely (we expect any child is potentially related to the copula).
            if any(ch.udeprel == 'cop' for ch in w.children):
                continue
            for ch in w.children:
                if ch not in mwords:
                    if not almost:
                        if self.print_should:
                            ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                        return False
                    # Punctuation before or after the mention span can depend on any of the mwords
                    # without breaking the almost_forest property.
                    # According to the UD guidelines, it should depend on the highest node within the phrase,
                    # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines.
                    if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]):
                        continue
                    # Some auxiliary words (e.g. prepositions) may be excluded from the mention span
                    # without breaking the almost_forest property, but they need to depend
                    # on the mention head (or if the mention is not a catena, they need to depend
                    # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords).
                    # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head),
                    # but          "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest
                    # because "with" depends on "Mary", which is not the mention head (nor a potential mention head).
                    if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)):
                        if self.print_should:
                            ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                        return False
        return True

    def _is_almost_continuous(self, mention):
        if ',' not in mention.span:
            return True
        nonempty = [w for w in mention.words if not w.is_empty()]
        if not nonempty:
            return True
        mwords = set(mention.words)
        gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords]
        for gap_node in gap_nodes:
            if not gap_node.is_empty():
                return False
        return True


[docs]
    def process_document(self, doc):
        mentions = []
        for entity in doc.coref_entities:
            if self._ok(len(entity.mentions) == 1, self.singleton):
                mentions.extend(entity.mentions)
        if self.shuffle:
            random.shuffle(mentions)
        else:
            mentions.sort()

        seen_trees = 0
        for mention in mentions:
            if not self._ok(len(mention.words) == 1, self.oneword):
                continue
            if not self._ok(',' not in mention.span, self.continuous):
                continue
            if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous):
                continue

            empty_mwords = [w for w in mention.words if w.is_empty()]
            if not self._ok(len(empty_mwords) > 0, self.empty):
                continue

            heads, mwords = 0, set(mention.words)
            for w in mention.words:
                if w.parent:
                    heads += 0 if w.parent in mwords else 1
                else:
                    heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
            if not self._ok(heads <= 1, self.treelet):
                continue
            if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest):
                continue
            if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest):
                continue

            for w in mention.words:
                w.misc['Mark'] = 1

            seen_trees += 1
            if self.max_trees and seen_trees > self.max_trees:
                if not self.print_total:
                    print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
                    return
            else:
                this_form = ' '.join([w.form for w in mention.words])
                print("# Mention = " + this_form)
                if self.print_other_forms:
                    counter = Counter()
                    for m in mention.entity.mentions:
                        forms = ' '.join([w.form for w in m.words])
                        if forms != this_form:
                            counter[forms] += 1
                    if counter:
                        print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='')
                        for form, count in counter.most_common(self.print_other_forms):
                            print(f' "{form}"({count})', end='')
                        print()
                self.print_block.process_tree(mention.head.root)
                for w in mention.words:
                    del w.misc['Mark']

        if self.print_total:
            if self.max_trees and seen_trees > self.max_trees:
                print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
            print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}')