# Source code for udapi.block.ud.fr.addmwt

"""Block ud.fr.AddMwt for heuristic detection of French contractions.

According to the UD guidelines, contractions such as "des" = "de les"
should be annotated using multi-word tokens.

Note that this block should be used only for converting legacy conllu files.
Ideally a tokenizer should have already split the MWTs.
"""
import udapi.block.ud.addmwt

# Contracted surface form (lowercase) -> analysis of its parts.
# Values are space-separated, one item per word of the multi-word token.
MWTS = {
    # preposition + article; 'upos' is filled in by the loop below
    'au': dict(form='à le', lemma='à le'),
    'aux': dict(form='à les', lemma='à le'),
    'des': dict(form='de les', lemma='de le'),
    'du': dict(form='de le', lemma='de le'),

    # preposition + a form of "lequel"
    'auquel': dict(form='à lequel', upos='ADP PRON', lemma='à lequel'),
    'auxquels': dict(form='à lesquels', upos='ADP PRON', lemma='à lequel'),
    'auxquelles': dict(form='à lesquelles', upos='ADP PRON', lemma='à lequel'),
    'desquels': dict(form='de lesquels', upos='ADP PRON', lemma='de lequel'),
    'desquelles': dict(form='de lesquelles', upos='ADP PRON', lemma='de lequel'),
    'duquel': dict(form='de lequel', upos='ADP PRON', lemma='de lequel'),
}
# TODO https://fr.wiktionary.org/wiki/des#Vocabulaire_apparent.C3.A9_par_le_sens_2
# lists more contractions, e.g. "dudit", "audit"

# Complete every entry in place: a missing (or falsy) field gets its default,
# while 'feats' is overwritten unconditionally.
for v in MWTS.values():
    v['upos'] = v.get('upos') or 'ADP DET'
    v['shape'] = v.get('shape') or 'subtree'
    # 'deprel' and 'main' defaults depend on the 'upos' settled just above.
    v['deprel'] = v.get('deprel') or ('case det' if v['upos'] == 'ADP DET' else 'case *')
    v['main'] = v.get('main') or (1 if v['upos'] == 'ADP PRON' else 0)
    v['feats'] = '_ *'


class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token.

        Only nodes tagged ADP are considered; their lowercased form is
        looked up in MWTS.
        """
        # "du" can be
        # - "de + le" (tagged ADP)
        # - the partitive article "du" (tagged DET)
        # - past participle of devoir (correctly dû, tagged VERB)
        # Only the ADP case should be split.
        # Similarly with "des" -> "de les".
        if node.upos != 'ADP':
            return None
        return MWTS.get(node.form.lower())

    # "du" has a shape which is neither "siblings" nor "subtree"
    # E.g. in "À partir du XXIe siècle"
    # "du" = "de le", but
    # "de" is attached to "À", while "le" is attached to "siècle".
    def postprocess_mwt(self, mwt):
        """Re-attach and relabel the two words of a split "du", "des", "au" or "aux".

        Other forms are left untouched. Nothing is returned; the words in
        `mwt.words` are modified in place.
        """
        # NOTE(review): the nesting below was reconstructed from a
        # whitespace-collapsed copy of this file; confirm that both 'fixed'
        # checks belong inside this form guard.
        if mwt.form.lower() not in {'du', 'des', 'au', 'aux'}:
            return

        prep, det = mwt.words[0], mwt.words[1]

        # Re-attach the second word only when it closes the subtree of the first.
        if prep.descendants[-1] == det:
            if prep.precedes(prep.parent):
                # The head of the first word follows it: share that head.
                det.parent = prep.parent
            else:
                following = det.next_node
                # Presumably next_node is None when the second word ends the
                # sentence (TODO confirm); then there is nothing to attach to
                # and the word stays where it is instead of crashing.
                if following is not None:
                    head = following
                    # Climb from the next node towards a NOUN/PROPN ancestor,
                    # but only through parents that do not precede their child.
                    while head.upos not in {'NOUN', 'PROPN'} and not head.is_root():
                        if head.parent.precedes(head):
                            head = following
                            break
                        head = head.parent
                    if head.is_root():
                        # Never attach to the root; fall back to the next node.
                        head = following
                    det.parent = head

        # Second word still under the first one, whose subtree ends in a
        # 'fixed' node: label the second word 'fixed' as well.
        if det.parent == prep and prep.descendants[-1].deprel == 'fixed':
            det.deprel = 'fixed'
        # First word attached leftwards, right after a 'case'/'fixed' node:
        # label the first word 'fixed' (cf. "À partir du ..." above).
        if (prep.parent.precedes(prep)
                and prep.prev_node.udeprel in {'case', 'fixed'}):
            prep.deprel = 'fixed'