Source code for udapi.block.ud.es.fixverbfeats

"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD."""
from udapi.core.block import Block
import logging
import re

class FixVerbFeats(Block):
    """Fix the features (and some lemmas) of VERB/AUX nodes in UD Spanish-PUD."""

    @staticmethod
    def _overwrite_feats(node, **new_feats):
        """Drop every feature of `node`, then set only `new_feats`, in order."""
        node.feats = {}
        for name, value in new_feats.items():
            node.feats[name] = value

    def process_node(self, node):
        """Rewrite the features of one verbal node according to its word form.

        The features assigned to verbs in Spanish PUD are often wrong, although
        the annotation was (reportedly) done manually. For example, infinitives
        are tagged with VerbForm=Fin instead of VerbForm=Inf.

        Only nodes whose `upos` is VERB or AUX are touched. The form is tested
        case-insensitively against three suffix patterns; the first one that
        matches decides what happens:

        * ``-ar/-er/-ir``: infinitive. Features become just ``VerbForm=Inf``
          and the lemma becomes the lowercased form.
        * ``-ndo``: gerund. Features become just ``VerbForm=Ger``; the lemma
          is left alone. The form "entiendo" is skipped entirely.
        * participle endings (``-ado``, ``-ido``, ``-hecho``, ... with optional
          ``-a``/``-s``): features become ``VerbForm=Part``, ``Tense=Past``,
          ``Gender`` and ``Number``. The lemma is rebuilt only for
          ``-ado/-ada(s)`` forms.

        Args:
            node: the node to fix; its `upos`, `form`, `feats` and `lemma`
                attributes are read and/or modified in place.
        """
        if not re.match(r'^(VERB|AUX)$', node.upos):
            return

        word = node.form

        if re.search(r'[aei]r$', word, re.IGNORECASE):
            # The infinitive has no features other than VerbForm.
            self._overwrite_feats(node, VerbForm='Inf')
            node.lemma = word.lower()
            return

        if re.search(r'ndo$', word, re.IGNORECASE):
            if word.lower() != 'entiendo':
                # The gerund has no features other than VerbForm.
                # The lemma is not always straightforward but it has been
                # fixed manually, so it is not modified here.
                self._overwrite_feats(node, VerbForm='Ger')
            return

        participle_ending = r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$'
        if not re.search(participle_ending, word, re.IGNORECASE):
            return

        # The (past) participle always has Gender and Number.
        # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine
        # singular is the lemma). As a verb, it also has Tense=Past. As an
        # adjective it does not have this feature (in AnCora; but why not?)
        # Existing non-empty values win; otherwise guess from the ending.
        gender = node.feats['Gender'] or (
            'Masc' if re.search(r'os?$', word, re.IGNORECASE) else 'Fem')
        number = node.feats['Number'] or (
            'Plur' if re.search(r's$', word, re.IGNORECASE) else 'Sing')
        self._overwrite_feats(
            node, VerbForm='Part', Tense='Past', Gender=gender, Number=number)

        # Only first-conjugation participles get their lemma recomputed here:
        # strip the -do/-da(s) ending and append -r (e.g. -ado -> -ar).
        if re.search(r'ad[oa]s?$', word, re.IGNORECASE):
            node.lemma = re.sub(r'd[oa]s?$', 'r', word.lower())