Source code for udapi.block.demo.complexity

"""demo.Complexity prints statistics on syntactic complexity.
"""
from udapi.core.basewriter import BaseWriter
from collections import deque


def non_punct(nodes):
    """Return a new list with the nodes whose ``upos`` is not ``'PUNCT'``.

    The relative order of the input nodes is kept.
    """
    kept = []
    for candidate in nodes:
        if candidate.upos != 'PUNCT':
            kept.append(candidate)
    return kept
def is_np(node):
    """Tell whether a node counts as nominal for the statistics in this module.

    A node qualifies when its UPOS is NOUN or PROPN, or when it is a PRON
    with ``PronType=Prs`` whose ``Poss`` feature is empty.
    """
    if node.upos in ("NOUN", "PROPN"):
        return True
    if node.upos != "PRON":
        return False
    # Personal pronouns only; possessive ones are excluded.
    return node.feats["PronType"] == "Prs" and not node.feats["Poss"]
def is_vp(node):
    """Tell whether a node is treated as the head of a verb phrase.

    E.g. "prosili, naléhali a žadonili" => 1 coordinated verb phrase, head "prosili".

    Original query:
    [POS == "VERB", [deprel == "conj", POS == "VERB"]], unique coordination heads

    TODO: include non-VERB nodes as well?
     - vznikla a byla přijata(conj,ADJ,parent=vznikla)
     - je(cop,AUX) nešťastný(ADJ) a nechá(conj,VERB,parent=nešťastný) se nalákat
     - "podařilo se to a dokladem(ClauseHead,NOUN,conj,parent=podařilo) je(cop,AUX,parent=dokladem)"
     - restrict this to (or make use of) ClauseHead only, or also include
       non-finite verbs (coordination of infinitives or participles)?
       "stihl(ClauseHead) napsat(VerbForm=Inf) a publikovat(VerbForm=Inf)"
       ... neither "napsat" nor "publikovat" is a ClauseHead
       "rozhodl se ukončit a ukazuje(ClauseHead,parent=ukončit)"
       correct would be parent=rozhodl, but the parser makes errors.
     - Parsing produces many errors in coordinations in general,
       so it is advisable to keep the conditions very restrictive.

    Returns True for a VERB; otherwise the (possibly empty, hence falsy)
    value stored under ``misc["ClauseHead"]`` is returned as is.
    """
    if node.upos == "VERB":
        return True
    return node.misc["ClauseHead"]
def is_relcl(node):
    """Tell whether a node is the head of a relative clause.

    Unfortunately, UDPipe 2.4 produces just ``acl`` instead of ``acl:relcl``,
    so besides the explicit ``acl:relcl`` relation, any ``acl`` node having
    a child whose ``PronType`` feature contains ``Rel`` is accepted too.
    """
    if node.deprel == 'acl:relcl':
        return True
    if node.udeprel != 'acl':
        return False
    for child in node.children:
        # Substring test on purpose: the feature value may list several types.
        if 'Rel' in child.feats['PronType']:
            return True
    return False
def is_postponed_nom_mod(node):
    """Tell whether a node is a postponed nominal modifier.

    Silvie's query:
    [(POS in {"NOUN", "PROPN"} | POS == "PRON" & feats:PronType == "Prs" & !(feats:Poss=="Yes")),
    child with higher word order than parent
    [deprel != "conj", POS in {"NOUN", "PROPN"} | POS == "PRON" & feats:PronType == "Prs" & !(feats:Poss=="Yes")]

    TODO: This finds phenomena that are entirely common in Czech, such as "vznik díla".
    Don't we want to search for something else?
    """
    if node.udeprel == 'conj':
        return False
    if not is_np(node):
        return False
    head = node.parent
    # The nominal parent has to come before the node.
    return head.precedes(node) and is_np(head)
def is_postponed_adj_mod(node):
    """Tell whether a node is an adjective placed after its nominal parent.

    TODO: we could distinguish a bare attribute ("písní ruských") from
    an expanded one ("milenec známý z pozdějšího zpracování"),
    e.g. by additionally requiring ``not node.children``.
    """
    head = node.parent
    if not head.precedes(node):
        return False
    if not is_np(head):
        return False
    return node.upos == 'ADJ'
def is_complex_nominal(node):
    """Tell whether a nominal node has at least two relevant descendants.

    Original query:
    [(POS in {"NOUN", "PROPN"} | POS == "PRON" & feats:PronType == "Prs" & !(feats:Poss=="Yes"))
    2x descendant [deprel != "conj"]]

    Descendants whose deprel is conj, punct, case, cc, dep or cop are not counted.

    TODO: should punct, case, cc and dep be ignored as well?
    TODO: really descendants and not children?
          (descendants easily grows beyond all bounds, e.g. when a subordinate clause is present)
    TODO: copulas will need to be filtered out anyway:
          "Jádrem tvorby jsou sbírky." - "Jádrem" has 3 children.
    TODO: and shouldn't the limit be raised from 2x to at least 3x?
    """
    if not is_np(node):
        return False
    ignored_relations = ('conj', 'punct', 'case', 'cc', 'dep', 'cop')
    relevant = sum(1 for desc in node.descendants if desc.deprel not in ignored_relations)
    return relevant > 1
def is_finite_clause_head(node):
    """Tell whether a node is the head of a finite clause.

    Silvie's query:
    [(POS == "VERB" & feats:Verbform == "Fin" | Verbform == "Part"} ) ]
    OR [(POS in {"ADJ", "NOUN", "PROPN"}, [child POS == "AUX")]]

    - POS == "VERB" is redundant, because VerbForm=Part is set for ADJ as well ("je nucen" etc.)
    - child POS == "AUX", on the other hand, also matches e.g.
      "Vidím psa(NOUN), který je(AUX,acl,parent=psa) z dávné doby."
    - adjectivized predicates (převažující(VerbForm=Part) básně) should not be marked as clause_head

    * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl,
      but for better robustness we include these as well.
    * Similarly "dep" and "orphan" are mostly parsing errors.
    * TODO: by additionally accepting nodes with an nsubj/csubj child,
      we find few more real clause heads, but also some false positives.

    The result is meant to be used for its truth value: when the node has
    no xcomp child (and no earlier rule applied), None is returned.
    """
    # TODO appos
    clause_relations = {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl',
                        'parataxis', 'amod', 'dep', 'orphan'}
    if node.udeprel in clause_relations and is_finite_verb(node):
        return True

    # A node with a copula child heads a clause, unless it is itself an xcomp.
    has_copula = any(child.udeprel == 'cop' for child in node.children)
    if has_copula and node.udeprel != 'xcomp':
        return True

    # Otherwise look at the first xcomp child: the copula may be attached there.
    first_xcomp = next((child for child in node.children if child.udeprel == 'xcomp'), None)
    if not first_xcomp:
        return first_xcomp
    return any(grandchild.udeprel == 'cop' for grandchild in first_xcomp.children)
# TODO: also include: bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf)
def is_finite_verb(node):
    """Tell whether a node is a finite verb form for the purposes of clause detection.

    The node must have ``VerbForm`` Fin or Part and be either a VERB,
    or an ADJ with at least one child attached as ``aux:pass``.
    """
    if node.feats['VerbForm'] not in {'Fin', 'Part'}:
        return False
    if node.upos == 'VERB':
        return True
    if node.upos != 'ADJ':
        return False
    # Participles tagged ADJ count only with a passive auxiliary attached.
    return any(child.deprel == 'aux:pass' for child in node.children)
def is_adjectivized_predicate(node):
    """Tell whether a node is an adjectivized predicate.

    E.g. "kouřící komín", "zbitý kluk".

    Silvie's query:
    [(POS == "ADJ" & feats:VerbForm == "Part"), parent [POS in {"NOUN", "PROPN"}] ]

    - The restriction parent [POS in {"NOUN", "PROPN"}] rules out cases such as
      "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti" and
      "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení"
      (the latter is probably a parsing error, it should have been parent=myšlení?).
      This function therefore also accepts a conj node whose parent is an ADJ.
    - On the other hand, the query also matches "způsob, jakým jsou popsány",
      which is why the condition ``not node.misc["ClauseHead"]`` was added.
    """
    if node.feats["VerbForm"] != "Part":
        return False
    if node.upos != "ADJ":
        return False
    nominal_parent = node.parent.upos in {"NOUN", "PROPN"}
    if not nominal_parent:
        # The only other accepted attachment is a conjunct of another adjective.
        coordinated_with_adj = node.udeprel == "conj" and node.parent.upos == "ADJ"
        if not coordinated_with_adj:
            return False
    return not node.misc["ClauseHead"]
def is_controlled_predicate(node):
    """Tell whether a node is a controlled predicate, i.e. attached as ``xcomp``.

    E.g. "Mohli jsme odejít i zůstat."

    TODO: Do we want to include the second and further conjuncts as well,
    e.g. "stihl napsat a publikovat", i.e.
    node.udeprel == "conj" and node.parent.udeprel == "xcomp"?
    """
    relation = node.deprel
    return relation == "xcomp"
class Complexity(BaseWriter):
    """Writer block printing statistics on the syntactic complexity of each tree.

    With ``matches=False`` (the default), ``process_tree`` prints one
    tab-separated line of numbers per tree. With ``matches=True``, one line
    is printed for every matched group instead (see ``print_match``).
    """

    # Forms counted as dashes: hyphen-minus, en-dash, em-dash, horizontal bar.
    _DASH_FORMS = frozenset('-–—―')

    def __init__(self, matches=False, **kwargs):
        """Store the ``matches`` switch; all other arguments go to ``BaseWriter``."""
        super().__init__(**kwargs)
        self.matches = matches

    def report(self, category, groups, expand_type='no'):
        """Print either every group of a category or just the number of groups.

        Args:
            category: label printed in the first column in matches mode.
            groups: list of node lists, one per match.
            expand_type: expansion mode handed over to ``expand_subtree``
                (used in matches mode only).
        """
        if self.matches:
            for group in groups:
                self.print_match(category, group, expand_type)
        else:
            # Counts are appended to the current output line, hence no newline.
            print("\t" + str(len(groups)), end='')

    def expand_subtree(self, nodes, expand_type):
        """Expand a single-node group according to ``expand_type``.

        Args:
            nodes: list of nodes; modes other than ``'no'`` support one node only.
            expand_type: ``'no'`` returns ``nodes`` unchanged, ``'subtree'``
                returns the node with all its descendants, and
                ``'subtree_within_clause'`` returns the sorted node plus those
                descendants reachable without passing through a ``conj`` child
                of the node or through a node marked ``misc["ClauseHead"]``.

        Raises:
            NotImplementedError: if more than one node should be expanded.
            ValueError: if ``expand_type`` is not one of the known modes.
        """
        if expand_type == 'no':
            return nodes
        if len(nodes) > 1:
            raise NotImplementedError("expanding more than one node not implemented yet")
        if expand_type == 'subtree':
            return nodes[0].descendants(add_self=True)
        # TODO: a 'subtree_except_conj' mode (the whole subtree except for
        # the conj children of the node) has not been implemented yet.
        if expand_type == 'subtree_within_clause':
            # Work on a copy, so that the list owned by the caller stays untouched.
            collected = list(nodes)
            stack = [n for n in nodes[0].children if n.udeprel != 'conj']
            while stack:
                node = stack.pop()
                if not node.misc["ClauseHead"]:
                    collected.append(node)
                    stack.extend(node.children)
            return sorted(collected)
        raise ValueError("unknown expand value " + expand_type)

    def print_match(self, category, group, expand_type='no'):
        """Print one tab-separated line describing a matched group.

        The columns are: category, sentence id, lemmas, UPOS tags and
        the number of non-punctuation nodes of the (expanded) group.
        """
        nodes = self.expand_subtree(group, expand_type)
        lemmas = " ".join(n.lemma for n in nodes)
        tags = " ".join(n.upos for n in nodes)
        n_tokens = str(len(non_punct(nodes)))
        print("\t".join([category, nodes[0].root.sent_id, lemmas, tags, n_tokens]))

    def get_main_clauses(self, root):
        """Return single-node groups for children of the root and their conj children."""
        main_heads = []
        for main_head in root.children:
            main_heads.append(main_head)
            main_heads.extend(n for n in main_head.children if n.udeprel == 'conj')
        return [[n] for n in main_heads]

    def get_coord_phrase(self, root, phrase_type_function):
        """Collect coordinations whose head and conjuncts satisfy a predicate.

        Args:
            root: root of the tree to be searched.
            phrase_type_function: predicate applied both to a candidate head
                and to its ``conj`` children.

        Returns:
            A list of sorted node lists, each holding the head, its matching
            conjuncts and the ``cc`` children of those conjuncts.
        """
        results = []
        for node in root.descendants:
            if not phrase_type_function(node):
                continue
            conjuncts = [n for n in node.children
                         if n.udeprel == 'conj' and phrase_type_function(n)]
            if not conjuncts:
                continue
            conjunctions = []
            for conj in conjuncts:
                # TODO multiword conjunctions (udeprel=flat)?
                conjunctions.extend(n for n in conj.children if n.udeprel == 'cc')
            results.append(sorted([node] + conjuncts + conjunctions))
        return results

    # TODO: coordination of both main and subordinate clauses
    def get_t_units(self, main_heads):
        """Return one node list per dependent clause found below each given head.

        For every head, the nodes reachable without entering a node marked
        ``misc["ClauseHead"]`` make up the (sorted) main clause. Each clause
        head met on the way contributes one result: the main clause followed
        by the whole subtree of that clause head.
        """
        results = []
        for main_head in main_heads:
            main_clause = [main_head]
            dep_heads = []
            # Explicit copy: the stack is consumed by pop() below.
            stack = list(main_head.children)
            while stack:
                node = stack.pop()
                if node.misc["ClauseHead"]:
                    dep_heads.append(node)
                else:
                    main_clause.append(node)
                    stack.extend(node.children)
            main_clause = sorted(main_clause)

            for dep_clause_head in dep_heads:
                results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree'))
        return results

    # TODO: a complex t-unit has a different definition: 3 clauses
    def get_complex_t_units(self, root):
        """Return the t-units headed by the non-root clause heads of the tree."""
        results = []
        for node in root.descendants:
            if node.deprel != 'root' and node.misc["ClauseHead"]:  # TODO: exclude the main clause?
                results += self.get_t_units([node])
        return results

    def process_tree(self, root):
        """Mark clause heads in the tree and print all statistics for it.

        First a ``# <text>`` line is printed. The tree is then traversed
        breadth-first from the root, computing the depth and the clause depth
        of every node and storing ``misc['ClauseHead'] = 1`` on each finite
        clause head. Finally all categories are reported; unless the matches
        mode is on, the numbers form a single tab-separated line beginning
        with the sentence id and ending with the UPOS of the first two
        non-punctuation nodes and the counts of opening brackets, dashes,
        colons and semicolons.
        """
        print("# " + root.text)

        allnodes = root.descendants
        # Both maps are keyed by node.ord; the entry 0 stands for the technical root.
        depth, clause_depth = {0: 0}, {0: 0}
        queue = deque(root.children)
        clause_heads = []
        while queue:
            node = queue.popleft()
            depth[node.ord] = depth[node.parent.ord] + 1
            clause_depth[node.ord] = clause_depth[node.parent.ord]
            if is_finite_clause_head(node):
                node.misc['ClauseHead'] = 1
                clause_heads.append(node)
                clause_depth[node.ord] += 1
            queue.extend(node.children)
        max_depth = max(depth.values())
        max_clause_depth = max(clause_depth.values())

        t_units = self.get_t_units([n for n in root.children if n.deprel == 'root'])
        total_t_units_length = sum(len(t_unit) for t_unit in t_units)
        # TODO: what should be reported when the sentence has no t-units?
        mean_t_unit_length = total_t_units_length / (len(t_units) or 1)

        if not self.matches:
            print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth,
                                              max_clause_depth, mean_t_unit_length]), end='')

        self.report("clauses", [[n] for n in clause_heads], 'subtree')
        self.report("adjectivized_predicates",
                    [[n] for n in allnodes if is_adjectivized_predicate(n)])
        self.report("controlled_predicates",
                    [[n] for n in allnodes if is_controlled_predicate(n)])
        self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause')
        self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp))
        self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np))
        self.report("coordinated_adjective_phrases",
                    self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET")))
        self.report("coordinated_adverb_phrases",
                    self.get_coord_phrase(root, lambda n: n.upos == "ADV"))
        self.report("t-units", t_units)
        self.report("complex_t-units", self.get_complex_t_units(root))
        # TODO: this finds "básně a písně" and "rychtář a rychtářka", which UDPipe
        # for some reason tagged as ADV and ADV (presumably this concerns the
        # coordinated adverb phrases above).
        # Check whether we have the best possible UDPipe model.
        self.report("relative_clauses",
                    [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause')
        self.report("postponed_nominal_modifiers",
                    [[n] for n in allnodes if is_postponed_nom_mod(n)])
        self.report("postponed_adjective_modifiers",
                    [[n] for n in allnodes if is_postponed_adj_mod(n)])
        self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)])

        if not self.matches:
            # TODO: for the total of coordinations it probably makes no sense
            # to report matches, only the total count?
            self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True))

            # Padding guarantees two entries even for trees with fewer than two words.
            nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE']
            brackets = str(len([n for n in allnodes if n.form == '(']))
            # Exact membership: a substring test would also count empty forms.
            dashes = str(len([n for n in allnodes if n.form in self._DASH_FORMS]))
            colons = str(len([n for n in allnodes if n.form == ':']))
            semicolons = str(len([n for n in allnodes if n.form == ';']))
            # Concatenate instead of passing two arguments to print(),
            # which would put a space after the leading tab.
            print("\t" + "\t".join([nonpunct_upos[0], nonpunct_upos[1],
                                    brackets, dashes, colons, semicolons]))