Source code for udapi.block.ud.markfeatsbugs

"""
Block to identify missing or ill-valued features in a treebank. Any bugs that it
finds will be saved in the MISC column as a Bug attribute, which can be later
used in filters and highlighted in text output. This is a base block that only
implements service methods. A language-specific block must be derived from this
one and define the actual rules valid in that language.

Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html
"""
from udapi.core.block import Block
import logging
import re

[docs] class MarkFeatsBugs(Block):
[docs] def bug(self, node, bugstring): bugs = [] if node.misc['Bug']: bugs = node.misc['Bug'].split('+') if not bugstring in bugs: bugs.append(bugstring) node.misc['Bug'] = '+'.join(bugs)
[docs] def check_allowed_features(self, node, allowed): """ We need a dictionary indexed by feature names that are allowed; for each feature name, there is a list of allowed values. """ # Check for features that are not allowed but the node has them. # For features that are allowed, check that their values are allowed. for f in node.feats: if f in allowed: if not node.feats[f] in allowed[f]: self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') else: self.bug(node, 'Feat' + f + 'NotAllowed')
[docs] def check_required_features(self, node, required): """ We need a list of names of features whose values must not be empty. """ for f in required: if not f in node.feats: self.bug(node, 'Feat' + f + 'Missing')
[docs] def process_node(self, node): """ This is a generic block, do nothing here. In a language-specific block based on this one, rules similar to the examples below can be specified: # NOUNS ################################################################ if node.upos == 'NOUN': self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Foreign': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Foreign': ['Yes']}) #... # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) """ return