# Source code for udapi.block.ud.es.addmwt

"""Block ud.es.AddMwt for heuristic detection of Spanish contractions.

According to the UD guidelines, contractions such as "del" = "de el"
should be annotated using multi-word tokens.

Note that this block should be used only for converting legacy conllu files.
Ideally a tokenizer should have already split the MWTs.
"""
import re
import udapi.block.ud.addmwt

# Contractions recognised by this block, keyed by their lowercased surface form.
# Only 'form' (the space-separated syntactic words) is given here; the
# attributes common to every entry are added by the loop further below.
MWTS = {
    'al': {'form': 'a el'},
    'del': {'form': 'de el'},
}

# Lemma assigned to each two-letter enclitic pronoun split off a verb form.
LEMMA = {
    'se': 'él',
    'le': 'él',
    'la': 'él',
    'lo': 'él',
    'te': 'tú',
    'me': 'yo',
}

# Fill in the values shared by all entries in MWTS: each contraction is an
# adposition followed by the masculine singular definite article.
# Two further keys are deliberately left at their defaults:
#   'main'  = 0           which of the two words inherits the original children (if any)
#   'shape' = 'siblings'  the newly created nodes will be siblings
for v in MWTS.values():
    v.update(
        lemma=v['form'],
        upos='ADP DET',
        deprel='* det',
        feats='_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art',
    )


class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree).

    The contractions listed in ``MWTS`` ("al", "del") are always handled.
    Splitting of verb forms with an enclitic pronoun is optional and is
    switched on with the ``verbpron`` parameter.
    """

    def __init__(self, verbpron=False, **kwargs):
        """Create the block.

        Args:
            verbpron: if true, `multiword_analysis` also tries to split
                VERB/AUX forms ending in an enclitic pronoun
                (me, la, le, lo, se, te, los, les).
            **kwargs: passed unchanged to the parent block.
        """
        super().__init__(**kwargs)
        self.verbpron = verbpron

    @staticmethod
    def _split_enclitic(node, form, pron, pron_lemma):
        """Build the MWT analysis for a verb `form` ending in the pronoun `pron`.

        Returns None (and leaves `node` untouched) when the verb is finite.
        Otherwise the VerbForm feature is removed from `node` and re-added
        explicitly in the returned 'feats' value for the verb part.
        """
        verbform = node.feats['VerbForm']
        # TODO there are contractions even with VerbForm=Fin
        if verbform == 'Fin':
            return None
        del node.feats['VerbForm']
        return {
            'form': form[:-len(pron)] + ' ' + pron,
            'lemma': '* ' + pron_lemma,
            'upos': '* PRON',
            'feats': 'VerbForm=%s *' % verbform,
            'deprel': '* iobj',
            'main': 0,
            'shape': 'subtree',
        }

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token."""
        analysis = MWTS.get(node.form.lower(), None)
        if analysis is not None:
            # Modify the default attachment of the new syntactic words in special situations.
            if node.udeprel in {'root', 'conj', 'reparandum'}:
                # Copy the dictionary so that we do not modify the original
                # and do not affect subsequent usages.
                analysis = analysis.copy()
                analysis['shape'] = 'subtree'
            return analysis

        if not self.verbpron or node.upos not in {'VERB', 'AUX'}:
            return None

        form = node.form.lower()
        if re.search(r'(me|la|le|lo|se|te)$', form):
            # "pese" is never split, whatever its VerbForm is
            # (presumably because it is not a verb+"se" combination -- TODO confirm).
            if form == 'pese':
                return None
            pron = form[-2:]
            return self._split_enclitic(node, form, pron, LEMMA[pron])
        if re.search(r'l[oe]s$', form):
            return self._split_enclitic(node, form, form[-3:], 'él')

        # TODO: multiple suffixes, e.g. compratelo = compra + te + lo
        return None

    # Sometimes "del" has a shape which is neither "siblings" nor "subtree".
    # E.g. in "a partir del NOUN"
    # "del" = "de el", but
    # "de" is attached to "a" (as fixed), while "el" is attached to the NOUN.
    def postprocess_mwt(self, mwt):
        """Re-attach the article of "al"/"del" if it ended up attached to the left.

        Starting from the node right after the article, climb towards the
        root until a NOUN or PROPN is found and make it the article's parent.
        If the climb would have to go leftwards (or reaches the root) first,
        the node right after the article is used instead.
        """
        if mwt.form.lower() not in {'al', 'del'}:
            return
        article = mwt.words[1]
        if not article.parent.precedes(article):
            return

        following = article.next_node
        if following is None:
            # The contraction is the last token, so there is nothing to the
            # right to attach the article to; keep the current attachment.
            return

        head = following
        while head.upos not in {'NOUN', 'PROPN'}:
            # Test is_root() first so that .parent is never dereferenced on the root.
            if head.is_root() or head.parent.precedes(head):
                head = following
                break
            head = head.parent
        article.parent = head