Source code for udapi.block.corefud.printmentions

import random
from collections import Counter
from udapi.core.block import Block
from udapi.block.write.textmodetreeshtml import TextModeTreesHtml
from udapi.block.write.textmodetrees import TextModeTrees

class PrintMentions(Block):
    """Print mentions with various properties.

    Every coreference mention in the document is tested against a set of
    filters; the mentions that pass are printed (as a text-mode tree of the
    sentence containing the mention head) with the mention words marked.

    Each filter parameter takes one of three values:

    * ``'include'`` -- the property is ignored (default),
    * ``'only'`` (or ``1``) -- only mentions having the property are printed,
    * ``'exclude'`` (or ``0``) -- only mentions lacking the property are printed.

    Filter parameters:
        continuous: the mention span contains no comma (i.e. no gap).
        almost_continuous: every node lying between the first and the last
            non-empty mention word and not belonging to the mention is empty
            (``is_empty()``); continuous mentions qualify trivially.
        treelet: at most one mention word has its parent outside the mention.
        forest: no mention word has a child outside the mention
            (words with a ``cop`` child are not checked).
        almost_forest: like ``forest``, but outside punctuation and some
            auxiliary-like children are tolerated (see ``_is_forest``).
        oneword: the mention consists of exactly one word.
        singleton: the entity of the mention has exactly one mention.
        empty: the mention contains at least one empty word (``is_empty()``).

    Other parameters:
        max_trees: print at most this many mentions; ``0`` means no limit.
        html: use ``TextModeTreesHtml`` instead of ``TextModeTrees`` for output.
        shuffle: shuffle the mentions (``random`` is seeded with 42, so the
            order is reproducible); otherwise the mentions are sorted.
        print_other_forms: maximum number of other (different) word-form
            strings of the same entity listed with each mention; a false
            value disables the listing.
        print_total: print a final line with the number of matching mentions
            and of all considered mentions.
        print_should: when a node breaks the (almost-)forest property, store
            the entity id in its ``misc["ShouldBeInSpanOf"]``.
        print_sent_id, print_text, add_empty_line, indent, minimize_cross,
        color, attributes, print_undef_as, print_doc_meta, print_comments,
        mark, hints, layout: passed unchanged to the tree-writing block.
        **kwargs: passed to ``Block.__init__``.
    """

    def __init__(self, continuous='include', almost_continuous='include', treelet='include',
                 forest='include', almost_forest='include', oneword='include', singleton='include',
                 empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
                 print_total=True, print_should=True,
                 print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
                 minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc',
                 print_undef_as='_', print_doc_meta=True, print_comments=False,
                 mark='(Mark)', hints=True, layout='classic',
                 **kwargs):
        super().__init__(**kwargs)
        self.continuous = self._convert(continuous)
        self.almost_continuous = self._convert(almost_continuous)
        self.treelet = self._convert(treelet)
        self.forest = self._convert(forest)
        self.almost_forest = self._convert(almost_forest)
        self.oneword = self._convert(oneword)
        self.singleton = self._convert(singleton)
        self.empty = self._convert(empty)

        self.max_trees = max_trees
        self.html = html
        self.shuffle = shuffle
        if shuffle:
            # Fixed seed: repeated runs print the same "random" sample.
            random.seed(42)
        self.print_other_forms = print_other_forms
        # NOTE: these two assignments used to end with a trailing comma, which
        # stored one-element tuples (always truthy) instead of the flags.
        self.print_total = print_total
        self.print_should = print_should
        print_class = TextModeTreesHtml if html else TextModeTrees
        self.print_block = print_class(
            print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line,
            indent=indent, minimize_cross=minimize_cross, color=color, attributes=attributes,
            print_undef_as=print_undef_as, print_doc_meta=print_doc_meta,
            print_comments=print_comments, mark=mark, hints=hints, layout=layout)

    def _convert(self, value):
        """Normalize a filter value to 'include', 'exclude' or 'only'.

        ``1`` is accepted as a synonym of 'only' and ``0`` of 'exclude'.

        Raises:
            ValueError: for any other value.
        """
        if value in {'include', 'exclude', 'only'}:
            return value
        if value == 1:
            return 'only'
        if value == 0:
            return 'exclude'
        # f-string instead of concatenation, so that a non-string value
        # results in the intended ValueError rather than a TypeError.
        raise ValueError(f'unknown value {value}')

    def before_process_document(self, document):
        """Delegate the pre-document hook to the tree-writing block."""
        self.print_block.before_process_document(document)

    def after_process_document(self, document):
        """Delegate the post-document hook to the tree-writing block."""
        self.print_block.after_process_document(document)

    def _ok(self, condition, value):
        """Tell whether a mention with property `condition` passes filter `value`.

        'include' always passes, 'only' passes when `condition` holds,
        'exclude' passes when it does not.
        """
        if value == 'include':
            return True
        return (condition and value == 'only') or (not condition and value == 'exclude')

    def _is_auxiliary_etc(self, node):
        """Is `node` one of the words tolerated outside an almost-forest mention?"""
        if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}:
            return True
        if node.deprel == 'advmod:emph':
            return True
        if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
            return True
        return False

    def _is_forest(self, mention, mwords, almost):
        """Check the forest (`almost=False`) or almost-forest (`almost=True`) property.

        Args:
            mention: the mention to check.
            mwords: set of the mention's words (for fast membership tests).
            almost: if true, tolerate some children outside the mention.

        Returns:
            False as soon as a child breaking the property is found (if
            `self.print_should` is set, the entity id is stored in that child's
            ``misc["ShouldBeInSpanOf"]``); True otherwise.
        """
        for w in mention.words:
            # UD unfortunately does not use the copula-as-head style for copula constructions,
            # so e.g. in "It is my fault", "fault" is the root of the tree
            # and all other words are its children.
            # However, in the cop-as-head style, only "my" would depend on "fault"
            # (and should be part of the mention).
            # It is difficult to tell apart which w.children are related to w and which to the copula.
            # We thus ignore these cases completely
            # (we expect any child is potentially related to the copula).
            if any(ch.udeprel == 'cop' for ch in w.children):
                continue
            for ch in w.children:
                if ch in mwords:
                    continue
                if not almost:
                    if self.print_should:
                        ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                    return False
                # Punctuation before or after the mention span can depend on any of the mwords
                # without breaking the almost_forest property.
                # According to the UD guidelines, it should depend on the highest node
                # within the phrase, i.e. on the mention head,
                # but it is not our goal now to check UD punctuation guidelines.
                if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]):
                    continue
                # Some auxiliary words (e.g. prepositions) may be excluded from the mention span
                # without breaking the almost_forest property, but they need to depend
                # on the mention head (or if the mention is not a catena, they need to depend
                # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords).
                # For example: "A gift for (e1 John)" is almost_forest
                # ("for" depends on "John" which is the mention head),
                # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest
                # because "with" depends on "Mary", which is not the mention head
                # (nor a potential mention head).
                if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)):
                    if self.print_should:
                        ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                    return False
        return True

    def _is_almost_continuous(self, mention):
        """Is the mention continuous once empty nodes in its gaps are disregarded?"""
        if ',' not in mention.span:
            return True
        nonempty = [w for w in mention.words if not w.is_empty()]
        if not nonempty:
            return True
        mwords = set(mention.words)
        first, last = nonempty[0], nonempty[-1]
        # Every non-mention node between the first and the last non-empty
        # mention word must itself be empty.
        return all(w.is_empty()
                   for w in mention.head.root.descendants
                   if w > first and w < last and w not in mwords)

    def _print_mention(self, mention):
        """Print the header lines and the tree for one matching mention."""
        this_form = ' '.join(w.form for w in mention.words)
        print("# Mention = " + this_form)
        if self.print_other_forms:
            counter = Counter()
            for m in mention.entity.mentions:
                forms = ' '.join(w.form for w in m.words)
                if forms != this_form:
                    counter[forms] += 1
            if counter:
                print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='')
                for form, count in counter.most_common(self.print_other_forms):
                    print(f' "{form}"({count})', end='')
                print()
        self.print_block.process_tree(mention.head.root)

    def process_document(self, doc):
        """Filter the mentions of `doc` and print those passing all filters.

        The words of each matching mention get a temporary ``misc['Mark']``
        attribute while the tree is printed; it is deleted again afterwards.
        """
        mentions = []
        for entity in doc.coref_entities:
            if self._ok(len(entity.mentions) == 1, self.singleton):
                mentions.extend(entity.mentions)
        if self.shuffle:
            random.shuffle(mentions)
        else:
            mentions.sort()

        limit_notice = (f'######## Only first {self.max_trees} matching mentions printed.'
                        ' Use max_trees=0 to see all.')
        seen_trees = 0
        for mention in mentions:
            if not self._ok(len(mention.words) == 1, self.oneword):
                continue
            if not self._ok(',' not in mention.span, self.continuous):
                continue
            if self.almost_continuous != 'include' and not self._ok(
                    self._is_almost_continuous(mention), self.almost_continuous):
                continue

            empty_mwords = [w for w in mention.words if w.is_empty()]
            if not self._ok(len(empty_mwords) > 0, self.empty):
                continue

            # Count the words whose governor lies outside the mention.
            # Words without a basic-tree parent are judged by their `deps`
            # instead (presumably these are empty nodes -- TODO confirm).
            heads, mwords = 0, set(mention.words)
            for w in mention.words:
                if w.parent:
                    heads += 0 if w.parent in mwords else 1
                else:
                    heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
            if not self._ok(heads <= 1, self.treelet):
                continue
            if self.forest != 'include' and not self._ok(
                    self._is_forest(mention, mwords, False), self.forest):
                continue
            if self.almost_forest != 'include' and not self._ok(
                    self._is_forest(mention, mwords, True), self.almost_forest):
                continue

            for w in mention.words:
                w.misc['Mark'] = 1

            seen_trees += 1
            over_limit = self.max_trees and seen_trees > self.max_trees
            if not over_limit:
                self._print_mention(mention)

            # Remove the temporary marks before anything else (in particular
            # before the early return below), so none are left in the document.
            for w in mention.words:
                del w.misc['Mark']

            # If the total is not requested, there is no need to keep counting
            # the remaining mentions once the limit has been exceeded.
            if over_limit and not self.print_total:
                print(limit_notice)
                return

        if self.print_total:
            if self.max_trees and seen_trees > self.max_trees:
                print(limit_notice)
            print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}')