"""Block ud.es.AddMwt for heuristic detection of Spanish contractions.
According to the UD guidelines, contractions such as "del" = "de el"
should be annotated using multi-word tokens.
Note that this block should be used only for converting legacy conllu files.
Ideally a tokenizer should have already split the MWTs.
"""
import re
import udapi.block.ud.addmwt
# Contractions of a preposition with the definite article, keyed by the
# lowercased surface form.  Only the split 'form' is listed per entry; the
# remaining attributes are identical for all entries and are filled in by the
# loop below.
MWTS = {
    'al': {'form': 'a el'},
    'del': {'form': 'de el'},
}

# Lemma assigned to each two-letter pronoun suffix that can be split off
# the end of a verb form.
LEMMA = {
    'se': 'él',
    'le': 'él',
    'la': 'él',
    'lo': 'él',
    'te': 'tú',
    'me': 'yo',
}

# shared values for all entries in MWTS
# Each value holds one space-separated item per new syntactic word
# (preposition first, article second).
# NOTE(review): '*' and '_' are interpreted by the parent AddMwt block
# (presumably "keep the original value" and "empty") -- confirm there.
for v in MWTS.values():
    v['lemma'] = v['form']
    v['upos'] = 'ADP DET'
    v['deprel'] = '* det'
    v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'
    # The following are the default values
    # v['main'] = 0 # which of the two words will inherit the original children (if any)
    # v['shape'] = 'siblings', # the newly created nodes will be siblings
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def __init__(self, verbpron=False, **kwargs):
        """Create the block.

        Args:
            verbpron: if true, non-finite verb forms ending in a pronoun
                suffix (e.g. "darle") are split as well; by default only
                the contractions listed in MWTS are handled.
            **kwargs: passed unchanged to the parent block.
        """
        super().__init__(**kwargs)
        self.verbpron = verbpron

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token.

        Side effect: when a verb + pronoun split is returned, the
        ``VerbForm`` feature is deleted from ``node.feats`` (its value is
        carried over in the returned 'feats' entry instead).
        """
        form = node.form.lower()

        analysis = MWTS.get(form)
        if analysis is not None:
            # Modify the default attachment of the new syntactic words in special situations.
            if node.udeprel in {'root', 'conj', 'reparandum'}:
                # Copy the dictionary so that we do not modify the original
                # and do not affect subsequent usages.
                analysis = analysis.copy()
                analysis['shape'] = 'subtree'
            return analysis

        if not self.verbpron or node.upos not in {'VERB', 'AUX'}:
            return None

        if re.search('(me|la|le|lo|se|te)$', form):
            # "pese" is explicitly exempt from splitting.
            if form == 'pese':
                return None
            pron = form[-2:]
            return self._verb_pronoun_analysis(node, form, pron, LEMMA[pron])

        if re.search('l[oe]s$', form):
            # NOTE(review): "las" is not matched by this pattern -- confirm
            # whether that omission is intentional.
            return self._verb_pronoun_analysis(node, form, form[-3:], 'él')

        # TODO: multiple suffixes, e.g. compratelo = compra + te + lo
        return None

    @staticmethod
    def _verb_pronoun_analysis(node, form, pron, pron_lemma):
        """Build the analysis splitting `form` into a verb part and the suffix `pron`.

        Returns None (leaving `node` untouched) when the form is finite or
        when nothing would remain for the verb part.
        """
        verbform = node.feats['VerbForm']
        # TODO there are contractions even with VerbForm=Fin
        if verbform == 'Fin':
            return None
        verb = form[:-len(pron)]
        if not verb:
            # The whole token is just the pronoun (e.g. "se" tagged as AUX);
            # splitting it would yield an empty verb word.
            return None
        del node.feats['VerbForm']
        # One space-separated item per new word: verb first, pronoun second.
        return {
            'form': verb + ' ' + pron,
            'lemma': '* ' + pron_lemma,
            'upos': '* PRON',
            'feats': 'VerbForm=%s *' % verbform,
            'deprel': '* iobj',
            'main': 0,
            'shape': 'subtree',
        }

    def postprocess_mwt(self, mwt):
        """Re-attach the article of "al"/"del" to a following nominal.

        Sometimes "del" has a shape which is neither "siblings" nor "subtree".
        E.g. in "a partir del NOUN"
        "del" = "de el", but
        "de" is attached to "a" (as fixed), while "el" is attached to the NOUN.

        Only acts when the article's current parent precedes it.  The new
        head is found by starting at the node right after the article and
        climbing to parents until a NOUN/PROPN is reached; if the climb
        would move to a parent on the left (or reach the root), the node
        right after the article is used instead.
        """
        if mwt.form.lower() not in {'al', 'del'}:
            return
        article = mwt.words[1]
        if not article.parent.precedes(article):
            return

        following = article.next_node
        if following is None:
            # No node follows the article (contraction at the very end of
            # the sentence), so there is nothing to re-attach it to.
            return

        head = following
        while head.upos not in {'NOUN', 'PROPN'}:
            # Check is_root() first: a root has no parent to call precedes() on.
            if head.is_root() or head.parent.precedes(head):
                head = following
                break
            head = head.parent
        article.parent = head