Source code for udapi.block.ud.cs.addmwt

"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens."""
import udapi.block.ud.addmwt
import re
import logging

# Static rules for 'aby', 'kdyby' and their inflected forms, which fuse a
# subordinating conjunction with a conditional auxiliary.
# Each row: (surface token, split form, Number of the auxiliary or None,
# Person of the auxiliary or None).
# Note: Occasionally 'jakoby' must be split as well, but never just because
# the string occurs; it is split only where explicitly requested for the given
# position, so it is deliberately absent from this table.
_CONDITIONAL_SPLITS = (
    ('abych', 'aby bych', 'Sing', '1'),
    ('kdybych', 'když bych', 'Sing', '1'),
    ('abys', 'aby bys', 'Sing', '2'),
    ('abysi', 'aby bys', 'Sing', '2'),
    ('kdybys', 'když bys', 'Sing', '2'),
    ('kdybysi', 'když bys', 'Sing', '2'),
    ('aby', 'aby by', None, None),
    ('kdyby', 'když by', None, None),
    ('abychom', 'aby bychom', 'Plur', '1'),
    ('abysme', 'aby bysme', 'Plur', '1'),
    ('kdybychom', 'když bychom', 'Plur', '1'),
    ('kdybysme', 'když bysme', 'Plur', '1'),
    # Old Czech 'abychme' == Modern Czech 'abychom'
    ('abychme', 'aby bychme', 'Plur', '1'),
    ('kdybychme', 'když bychme', 'Plur', '1'),
    ('abyste', 'aby byste', 'Plur', '2'),
    ('abyšte', 'aby byšte', 'Plur', '2'),
    ('kdybyste', 'když byste', 'Plur', '2'),
    ('kdybyšte', 'když byšte', 'Plur', '2'),
    # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example
    # in data so far is 3rd.
    ('abyšta', 'aby byšta', 'Dual', '3'),
    ('kdybyšta', 'když byšta', 'Dual', '3'),
)

# XPOS letters for the number/person slots of the auxiliary tag; anything
# not listed here is rendered as '-'.
# NOTE(review): Number=Dual and Person=3 therefore leave both slots as '-'
# although FEATS carries them -- confirm that this is intended.
_XPOS_NUMBER = {'Sing': 'S', 'Plur': 'P'}
_XPOS_PERSON = {'1': '1', '2': '2'}

MWTS = {}
for _token, _split_form, _number, _person in _CONDITIONAL_SPLITS:
    if _number is None:
        _aux_feats = 'Aspect=Imp|Mood=Cnd|VerbForm=Fin'
    else:
        _aux_feats = 'Aspect=Imp|Mood=Cnd|Number=%s|Person=%s|VerbForm=Fin' % (_number, _person)
    MWTS[_token] = {
        'form': _split_form,
        # The conjunction itself has no features ('_').
        'feats': '_ ' + _aux_feats,
        'upos': 'SCONJ AUX',
        'xpos': 'J,------------- Vc-%s---%s-------' % (
            _XPOS_NUMBER.get(_number, '-'),
            _XPOS_PERSON.get(_person, '-'),
        ),
        'deprel': '* aux',
        # Lemma of the conjunction is its split form; the auxiliary is 'být'.
        'lemma': _split_form.split()[0] + ' být',
        'main': 0,
        'shape': 'siblings',
    }

# Static rules for the contractions 'nač', 'oč', 'zač' (preposition + 'co',
# e.g. nač -> na + co); 'proč' is intentionally not among them.
# The rules are added to the already existing dictionary MWTS, one
# independent dict per contraction.
MWTS.update({
    adposition + 'č': {
        'form': adposition + ' co',
        'lemma': adposition + ' co',
        'upos': 'ADP PRON',
        'xpos': 'RR--4---------- PQ--4----------',
        'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel',
        'deprel': 'case *',
        # The pronoun (second word) is the main word of the MWT.
        'main': 1,
        'shape': 'subtree',
    }
    for adposition in ('na', 'o', 'za')
})
# 'seč' = 'se' + 'co'. One instance was split (and annotated as ADP +
# accusative!) in 19th century texts (Hičkok etalon); a few more instances
# were found in older texts, too (e.g. 16th century).
# It needs a rule of its own because the preposition is vocalized: the form
# is 'se' while the lemma is 's'.
MWTS['seč'] = dict(
    form='se co',
    lemma='s co',
    upos='ADP PRON',
    xpos='RV--4---------- PQ--4----------',
    feats='AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel',
    deprel='case *',
    main=1,
    shape='subtree',
)
# In 17th century texts, 'načež' sometimes should be interpreted really as
# relative pronoun with preposition, not as adverb meaning "thereafter".
# The preposition 'na' is not vocalized, so its XPOS is 'RR', in agreement
# with AdpType=Prep and with the 'nač' / 'oč' / 'zač' rules; 'RV' is used in
# this file only together with AdpType=Voc (e.g. 'se' in 'seč').
MWTS['načež'] = {
    'form': 'na' + ' což',
    'lemma': 'na' + ' co',
    'upos': 'ADP PRON',
    'xpos': 'RR--4---------- PE--4----------',
    'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Rel',
    'deprel': 'case *',
    # The pronoun (second word) is the main word of the MWT.
    'main': 1,
    'shape': 'subtree',
}

# Old Czech 'takliž' = 'tak' + 'liž' (lemma 'li'); the first word is the main
# word and the second one is attached as its sibling.
MWTS['takliž'] = dict(
    form='tak liž',
    lemma='tak li',
    upos='ADV SCONJ',
    xpos='Db------------- J,-------------',
    feats='PronType=Dem Emph=Yes',
    deprel='advmod mark',
    main=0,
    shape='siblings',
)

# Old Czech 'toliť' = 'to' + 'li' + 'ť'. This is a special case with three
# subtokens, hence a static rule; the general enclitic -ť is handled
# dynamically in AddMwt.multiword_analysis().
MWTS['toliť'] = dict(
    form='to li ť',
    lemma='ten li ť',
    upos='DET SCONJ PART',
    xpos='* J,------------- TT-------------',
    feats='* _ _',
    deprel='* mark discourse',
    main=0,
    shape='siblings',
)




[docs]
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    # Enclitics that are split off purely on the annotator's instruction,
    # i.e. without checking that host + enclitic spell the surface form.
    # Maps enclitic -> (lemma, upos, xpos, feats, deprel, shape). A shape of
    # None stands for "'subtree' if the host node is a VERB, else 'siblings'".
    _UNCHECKED_ENCLITICS = {
        # The patterns with -by, -bych... (aby, kdyby) are mostly solved by
        # the static MWTS rules, but the exception is 'jakoby', where we need
        # an instruction at the specific position, otherwise we do not split it.
        'by': (
            '* být', '* AUX', '* Vc-------------',
            '* Aspect=Imp|Mood=Cnd|VerbForm=Fin',
            '* aux', 'siblings',
        ),
        'jsi': (
            '* být', '* AUX', '* VB-S---2P-AAI--',
            '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act',
            '* aux', None,
        ),
        'jest': (
            '* být', '* AUX', '* VB-S---3P-AAI-2',
            '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act',
            '* aux', None,
        ),
        'i': (
            '* i', '* CCONJ', '* J^-------------',
            '* _',
            '* cc', 'subtree',
        ),
    }

    # Enclitics that are split only if host + enclitic is exactly the surface
    # form of the node. Same value layout as _UNCHECKED_ENCLITICS.
    _T_PARTICLE = ('* ť', '* PART', '* TT-------------', '* _', '* discourse', 'subtree')
    _CHECKED_ENCLITICS = {
        'ť': _T_PARTICLE,
        'tě': _T_PARTICLE,
        'ti': _T_PARTICLE,
        't': _T_PARTICLE,
        # dajžto = dajž + to
        'to': (
            '* ten', '* DET', '* PDNS4----------',
            '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem',
            '* obj', 'subtree',
        ),
    }

    # Contraction of a preposition with the pronoun 'něj' ('-ň' / '-n') or
    # 'nějž' ('-ňž' / '-nž'); group 1 is the preposition, group 2 the
    # optional relative 'ž'. Matched against the lowercased form.
    _CONTRACTION_RE = re.compile(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$")
    _SKRZE_RE = re.compile(r"^ski?rz[eě]$")

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token.

        The rules are tried in this order:

        1. Static rules: the lowercased form is looked up in the module-level
           MWTS table. The table entry itself (not a copy) is returned.
        2. Dynamic rules: used only if MISC contains a non-empty 'AddMwt'
           instruction consisting of exactly two whitespace-separated
           subtokens (host and enclitic); otherwise a warning is logged and
           None is returned. These are annotation conventions used for
           Old Czech in the Hičkok project.
        3. Contractions of a preposition and a pronoun (e.g. 'oň'), also only
           when an 'AddMwt' instruction is present.

        Whenever an analysis is returned, MISC 'AddMwt' is reset to ''.
        The returned dict has the keys 'form', 'lemma', 'upos', 'xpos',
        'feats', 'deprel', 'main' and 'shape'.
        """
        # Avoid adding a MWT if the current node already is part of an MWT.
        if node.multiword_token:
            return None

        analysis = MWTS.get(node.form.lower())
        if analysis is not None:
            node.misc['AddMwt'] = ''
            return analysis

        # If the node did not match any of the static rules defined in MWTS,
        # check it against the "dynamic" rules. An enclitic will be separated
        # from its host but only if it has been marked by an annotator in MISC.
        instruction = node.misc['AddMwt']
        if instruction == '':
            return None
        subtokens = instruction.split()
        if len(subtokens) != 2:
            logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens.", instruction)
            return None
        host, enclitic = subtokens

        if enclitic in self._UNCHECKED_ENCLITICS:
            return self._split_enclitic(node, host, enclitic, self._UNCHECKED_ENCLITICS[enclitic])

        if enclitic in self._CHECKED_ENCLITICS:
            # A failed check ends the analysis; the contraction rule below is
            # not tried and the instruction stays in MISC.
            if host + enclitic != node.form:
                logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'.",
                                instruction, node.form)
                return None
            return self._split_enclitic(node, host, enclitic, self._CHECKED_ENCLITICS[enclitic])

        return self._split_contraction(node)

    def _split_enclitic(self, node, host, enclitic, rule):
        """Build the analysis for `host` + `enclitic` from a rule tuple and clear MISC 'AddMwt'."""
        lemma, upos, xpos, feats, deprel, shape = rule
        if shape is None:
            shape = 'subtree' if node.upos == 'VERB' else 'siblings'
        node.misc['AddMwt'] = ''
        return {
            'form':   host + ' ' + enclitic,
            'lemma':  lemma,
            'upos':   upos,
            'xpos':   xpos,
            'feats':  feats,
            'deprel': deprel,
            'main':   0,
            'shape':  shape,
        }

    def _split_contraction(self, node):
        """Return the analysis of a preposition+pronoun contraction, or None if the form is none.

        Contractions of prepositions and pronouns almost could be processed
        regardless of AddMwt instructions by the annotator, but we still
        require it to be on the safe side. For example, both 'přědeň' and
        'přěden' are attested in Old Czech but then we do not want to catch
        'on' (besides the wanted 'oň'). Another reason is that the pronoun
        could be masculine or neuter. We pick Gender=Masc and Animacy=Anim
        by default, unless the original token was annotated as Animacy=Inan
        or Gender=Neut.

        MISC 'AddMwt' is cleared only when the form matches.
        """
        match = self._CONTRACTION_RE.match(node.form.lower())
        if match is None:
            return None
        node.misc['AddMwt'] = ''
        preposition, relative_suffix = match.groups()

        # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze'
        # (skrze něj).
        if preposition == 'přěde':
            pform, plemma, adptype, at = 'přěd', 'před', 'Voc', 'V'
        elif self._SKRZE_RE.match(preposition):
            pform, plemma, adptype, at = preposition, 'skrz', 'Voc', 'V'
        else:
            pform, plemma, adptype, at = preposition, preposition, 'Prep', 'R'

        # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---.
        if node.feats['Gender'] == 'Neut':
            gender, animacy, g = 'Neut', '', 'N'
        elif node.feats['Animacy'] == 'Inan':
            gender, animacy, g = 'Masc', 'Animacy=Inan|', 'I'
        else:
            gender, animacy, g = 'Masc', 'Animacy=Anim|', 'M'

        if relative_suffix == 'ž':
            # Relative pronoun: '-ňž' = preposition + 'nějž'.
            pronoun_form = 'nějž'
            pronoun_lemma = 'jenž'
            pronoun_xpos = 'P4' + g + 'S4---------2'
            pronoun_feats = (animacy + 'Case=Acc|Gender=' + gender
                             + '|Number=Sing|PrepCase=Pre|PronType=Rel')
        else:
            # Personal pronoun: '-ň' = preposition + 'něj'.
            pronoun_form = 'něj'
            pronoun_lemma = 'on'
            pronoun_xpos = 'PE' + g + 'S4--3-------'
            pronoun_feats = (animacy + 'Case=Acc|Gender=' + gender
                             + '|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs')
        return {
            'form': pform + ' ' + pronoun_form,
            'lemma': plemma + ' ' + pronoun_lemma,
            'upos': 'ADP PRON',
            'xpos': 'R' + at + '--4---------- ' + pronoun_xpos,
            'feats': 'AdpType=' + adptype + '|Case=Acc ' + pronoun_feats,
            'deprel': 'case *',
            'main': 1,
            'shape': 'subtree',
        }

    def postprocess_mwt(self, mwt):
        """Reattach the second word of `mwt` if the first one is a 'fixed' dependent under a verb.

        When the first word has deprel 'fixed' and the parent of its parent
        is a VERB, the second word is attached to that verb.
        """
        first_word = mwt.words[0]
        if first_word.deprel == 'fixed' and first_word.parent.parent.upos == 'VERB':
            mwt.words[1].parent = first_word.parent.parent