Source code for udapi.block.ud.ca.addmwt

"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions.

According to the UD guidelines, contractions such as "del" = "de el"
should be annotated using multi-word tokens.

Note that this block should be used only for converting legacy conllu files.
Ideally a tokenizer should have already split the MWTs.
"""
import re
import udapi.block.ud.addmwt

# Catalan preposition+article contractions, keyed by the lowercased surface form.
# Every value holds space-separated per-word attributes of the two syntactic words
# ('form', 'lemma', 'feats'); '_' in 'feats' means that the first word (the
# preposition) gets no features.
MWTS = {
    'al':      {'form': 'a el',    'lemma': 'a el',   'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
    'als':     {'form': 'a els',   'lemma': 'a el',   'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
    'del':     {'form': 'de el',   'lemma': 'de el',  'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
    'dels':    {'form': 'de els',  'lemma': 'de el',  'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
    'pel':     {'form': 'per el',  'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
    'pels':    {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
}

# shared values for all entries in MWTS
for v in MWTS.values():
    # Keep the lemma declared in the table: for the plural contractions it differs
    # from the form ('a els' -> lemma 'a el'). Fall back to the form only when an
    # entry does not declare a lemma at all.
    v.setdefault('lemma', v['form'])
    v['upos'] = 'ADP DET'
    v['deprel'] = '* det'
    # The following are the default values
    # v['main'] = 0 # which of the two words will inherit the original children (if any)
    # v['shape'] = 'siblings' # the newly created nodes will be siblings



[docs]
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def __init__(self, verbpron=False, **kwargs):
        """Create the block.

        Args:
            verbpron: stored unchanged as `self.verbpron`; none of the methods
                visible in this file reads it.
            **kwargs: forwarded unchanged to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.verbpron = verbpron


[docs]
    def multiword_analysis(self, node):
        """Look up `node` among the known contractions.

        Returns the MWT description (a dict) for the lowercased form of `node`,
        or None when the form is not a known contraction.
        """
        entry = MWTS.get(node.form.lower())
        if entry is None:
            return None

        # In special situations the default attachment of the new syntactic words is modified.
        needs_subtree = re.match(r'^(root|conj|reparandum)$', node.udeprel)
        if not needs_subtree:
            return entry

        # Build a new dictionary instead of touching the shared table entry,
        # so that subsequent lookups of the same form are not affected.
        return dict(entry, shape='subtree')



[docs]
    def fix_personal_pronoun(self, node):
        """Overwrite the lemma and features of a personal pronoun according to its form.

        Nodes whose UPOS is not PRON are left untouched, as are pronouns whose
        form matches none of the rules below. The form is compared
        case-insensitively. The node is modified in place; nothing is returned.
        """
        # There is a mess in lemmas and features of personal pronouns.
        if node.upos != 'PRON':
            return

        # Ordered rules: (pattern matched against the form, lemma, features).
        rules = (
            ("^jo$", 'jo', 'Case=Nom|Number=Sing|Person=1|PronType=Prs'),
            ("^(em|m'|-me|'m|me|m)$", 'jo', 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs'),
            ("^mi$", 'jo', 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs'),
            ("^tu$", 'tu', 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs'),
            ("^(et|t'|-te|'t|te|t)$", 'tu', 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs'),
            ("^ti$", 'tu', 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs'),
            # Strong forms of third person pronouns can be used as subjects or after preposition.
            # Do not mark them as nominative (because of the prepositions).
            ("^ell$", 'ell', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs'),
            ("^ella$", 'ell', 'Gender=Fem|Number=Sing|Person=3|PronType=Prs'),
            ("^(el|-lo|'l|lo)$", 'ell', 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs'),
            ("^(la|-la)$", 'ell', 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs'),
            ("^(l')$", 'ell', 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs'),
            ("^(ho|-ho)$", 'ell', 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs'),
            ("^(li|-li)$", 'ell', 'Case=Dat|Number=Sing|Person=3|PronType=Prs'),
            ("^(es|s'|-se|'s|se|s)$", 'ell', 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes'),
            ("^si$", 'ell', 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes'),
            # If nosaltres can be used after a preposition, we should not tag it as nominative.
            ("^nosaltres$", 'jo', 'Number=Plur|Person=1|PronType=Prs'),
            # Nós is the majestic first person singular.
            # In accusative and dative, it is identical to first person plural.
            ("^nós$", 'jo', 'Number=Sing|Person=1|Polite=Form|PronType=Prs'),
            ("^(ens|-nos|'ns|nos|ns)$", 'jo', 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs'),
            ("^vosaltres$", 'tu', 'Number=Plur|Person=2|PronType=Prs'),
            # Vós is the formal second person singular.
            # In accusative and dative, it is identical to second person plural.
            # Vostè is even more formal than vós.
            # In accusative and dative, it is identical to third person singular.
            ("^(vós|vostè)$", 'tu', 'Number=Sing|Person=2|Polite=Form|PronType=Prs'),
            ("^vostès$", 'tu', 'Number=Plur|Person=2|Polite=Form|PronType=Prs'),
            ("^(us|-vos|-us|vos)$", 'tu', 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs'),
            # Strong forms of third person pronouns can be used as subjects or after preposition.
            # Do not mark them as nominative (because of the prepositions).
            ("^ells$", 'ell', 'Gender=Masc|Number=Plur|Person=3|PronType=Prs'),
            ("^elles$", 'ell', 'Gender=Fem|Number=Plur|Person=3|PronType=Prs'),
            # Els is masculine accusative, or dative in any gender.
            ("^(els|-los|'ls|los|ls)$", 'ell', 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs'),
            ("^(les|-les)$", 'ell', 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs'),
            # There are also "adverbial" pronominal clitics that can occur at direct object positions.
            ("^(en|n'|'n|-ne|n|ne)$", 'en', 'Case=Gen|Person=3|PronType=Prs'),
            ("^(hi|-hi)$", 'hi', 'Case=Loc|Person=3|PronType=Prs'),
        )

        # Every rule is tried, in order, without stopping at the first match.
        for pattern, lemma, feats in rules:
            if re.match(pattern, node.form, re.IGNORECASE):
                node.lemma = lemma
                node.feats = feats



[docs]
    def report_suspicious_lemmas(self, node):
        """Print a line for `node` if its lemma looks like it belongs to another word.

        A node is suspicious when the lowercased form and the lowercased lemma
        start with different characters and the pair is not one of the known
        legitimate exceptions listed below. The report goes to standard output.
        """
        # There are offset issues of split multiword expressions.
        # Sometimes a word gets the lemma of the neighboring word.
        if node.form.lower()[:1] == node.lemma.lower()[:1]:
            return

        # Legitimate cases where the lemma starts with a different letter:
        # (lemma, pattern that must match at the beginning of the form).
        known_exceptions = (
            ('jo', "(em|ens|m'|me|mi|nos|nosaltres|'ns)"),
            ('tu', "(et|'t|us|vosaltres|vostè)"),
            ('el', "(la|l|l'|les)"),
            ('ell', "(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)"),
            ('es', "(s|se)"),
            ('em', "('m|m|m')"),
            ('en', "('n|n'|ne|-ne)"),
            ('anar', "(va|van|vàrem)"),
            ('ser', "(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)"),
            ('estar', "(sigut)"),
            ('caure', "(queia|queies|quèiem|quèieu|queien)"),
            ('ampli', "(àmplia|àmplies)"),
            ('indi', "(índies)"),
            ('obvi', "(òbvia)"),
            ('ossi', "(òssies)"),
            ('ús', "(usos)"),
        )
        legitimate = any(
            node.lemma == lemma and re.match(pattern, node.form, re.IGNORECASE)
            for lemma, pattern in known_exceptions
        )

        if not legitimate:
            # Form = '2001/37/CE', lemma = 'CE'
            # Form = 'nº5', lemma = '5'
            # Form = 'kg.', lemma = 'quilogram'
            # Form = 'un', lemma = '1'
            legitimate = (
                node.lemma == 'CE'
                or re.match("nº", node.form, re.IGNORECASE)
                or re.match("^quil[oò]", node.lemma, re.IGNORECASE)
                or re.match("^[0-9]+$", node.lemma)
            )

        if not legitimate:
            print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address()))