Source code for udapi.block.ud.cs.addmwt

"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens."""
import udapi.block.ud.addmwt
import re
import logging

# Define static rules for 'aby', 'kdyby' and similar forms.
MWTS = {
    'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'},
    'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'},
    'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
    'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
    'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
    'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
    'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'},
    'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'},
    # Note: Occasionally we also need to split 'jakoby' but we must not do it whenever we see the string, it must be specifically ordered in the given position!
    'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'abysme': {'form': 'aby bysme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'kdybysme': {'form': 'když bysme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    # Old Czech 'abychme' == Modern Czech 'abychom'
    'abychme': {'form': 'aby bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
    'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
    'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
    'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
    # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd.
    'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'},
    'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'},
}
for v in MWTS.values():
    v['upos'] = 'SCONJ AUX'
    number = '-'
    if 'Sing' in v['feats']:
        number = 'S'
    elif 'Plur' in v['feats']:
        number = 'P'
    person = '-'
    if 'Person=1' in v['feats']:
        person = '1'
    elif 'Person=2' in v['feats']:
        person = '2'
    v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person)
    v['deprel'] = '* aux'
    v['lemma'] = v['form'].split()[0] + ' být'
    v['main'] = 0
    v['shape'] = 'siblings'

# Define static rules for 'nač', 'oč', 'zač' (but not 'proč').
# Add them to the already existing dictionary MWTS.
# nač -> na + co
for prep in 'na o za'.split():
    MWTS[prep + 'č'] = {
        'form': prep + ' co',
        'lemma': prep + ' co',
        'upos': 'ADP PRON',
        'xpos': 'RR--4---------- PQ--4----------',
        'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel',
        'deprel': 'case *',
        'main': 1,
        'shape': 'subtree',
    }
# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!)
# A few additional instances were found in older texts, too (e.g. 16th century).
# We must do it separately, as the preposition is vocalized.
MWTS['seč'] = {
    'form': 'se' + ' co',
    'lemma': 's' + ' co',
    'upos': 'ADP PRON',
    'xpos': 'RV--4---------- PQ--4----------',
    'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel',
    'deprel': 'case *',
    'main': 1,
    'shape': 'subtree',
}
# In 17th century texts, 'načež' sometimes should be interpreted really as
# relative pronoun with preposition, not as adverb meaning "thereafter".
MWTS['načež'] = {
    'form': 'na' + ' což',
    'lemma': 'na' + ' co',
    'upos': 'ADP PRON',
    'xpos': 'RV--4---------- PE--4----------',
    'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Rel',
    'deprel': 'case *',
    'main': 1,
    'shape': 'subtree',
}

# Old Czech 'takliž'.
MWTS['takliž'] = {
    'form':   'tak liž',
    'lemma':  'tak li',
    'upos':   'ADV SCONJ',
    'xpos':   'Db------------- J,-------------',
    'feats':  'PronType=Dem Emph=Yes',
    'deprel': 'advmod mark',
    'main':   0,
    'shape':  'siblings'
}

# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below).
MWTS['toliť'] = {
    'form':   'to li ť',
    'lemma':  'ten li ť',
    'upos':   'DET SCONJ PART',
    'xpos':   '* J,------------- TT-------------',
    'feats':  '* _ _',
    'deprel': '* mark discourse',
    'main':   0,
    'shape':  'siblings'
}



[docs] class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree)."""
[docs] def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" # Avoid adding a MWT if the current node already is part of an MWT. if node.multiword_token: return None analysis = MWTS.get(node.form.lower(), None) if analysis is not None: node.misc['AddMwt'] = '' return analysis # If the node did not match any of the static rules defined in MWTS, # check it against the "dynamic" rules below. The enclitic 'ť' will be # separated from its host but only if it has been marked by an annotator # in MISC. (These are annotation conventions used for Old Czech in the # Hičkok project.) if node.misc['AddMwt'] != '': subtokens = node.misc['AddMwt'].split() if len(subtokens) != 2: logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt']) return None token_from_subtokens = ''.join(subtokens) # The patterns with -by, -bych... (aby, kdyby) are mostly solved above, # but the exception is 'jakoby', where we need instruction at specific # position, otherwise we do not split it. if subtokens[1] == 'by': node.misc['AddMwt'] = '' return { 'form': subtokens[0] + ' by', 'lemma': '* být', 'upos': '* AUX', 'xpos': '* Vc-------------', 'feats': '* Aspect=Imp|Mood=Cnd|VerbForm=Fin', 'deprel': '* aux', 'main': 0, 'shape': 'siblings', } if subtokens[1] == 'jsi': node.misc['AddMwt'] = '' return { 'form': subtokens[0] + ' jsi', 'lemma': '* být', 'upos': '* AUX', 'xpos': '* VB-S---2P-AAI--', 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', 'deprel': '* aux', 'main': 0, 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', } if subtokens[1] == 'jest': node.misc['AddMwt'] = '' return { 'form': subtokens[0] + ' jest', 'lemma': '* být', 'upos': '* AUX', 'xpos': '* VB-S---3P-AAI-2', 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', 'deprel': '* aux', 'main': 0, 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', } if subtokens[1] == 'i': node.misc['AddMwt'] = '' return { 'form': subtokens[0] + ' i', 'lemma': '* i', 'upos': '* CCONJ', 'xpos': '* J^-------------', 'feats': '* _', 'deprel': '* cc', 'main': 0, 'shape': 'subtree', } if subtokens[1] in ['ť', 'tě', 'ti', 't']: if token_from_subtokens != node.form: logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form)) return None node.misc['AddMwt'] = '' return { 'form': subtokens[0] + ' ' + subtokens[1], 'lemma': '* ť', 'upos': '* PART', 'xpos': '* TT-------------', 'feats': '* _', 'deprel': '* discourse', 'main': 0, 'shape': 'subtree', } # dajžto = dajž + to if subtokens[1] == 'to': if token_from_subtokens != node.form: logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form)) return None node.misc['AddMwt'] = '' return { 'form': subtokens[0] + ' ' + subtokens[1], 'lemma': '* ten', 'upos': '* DET', 'xpos': '* PDNS4----------', 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem', 'deprel': '* obj', 'main': 0, 'shape': 'subtree', } # Contractions of prepositions and pronouns almost could be processed # regardless of AddMwt instructions by the annotator, but we still # require it to be on the safe side. For example, both 'přědeň' and # 'přěden' are attested in Old Czech but then we do not want to catch # 'on' (besides the wanted 'oň'). Another reason si that the pronoun # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim # by default, unless the original token was annotated as Animacy=Inan # or Gender=Neut. m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) if m: node.misc['AddMwt'] = '' # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' # (skrze něj). if m.group(1) == 'přěde': pform = 'přěd' plemma = 'před' adptype = 'Voc' at = 'V' elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): pform = m.group(1) plemma = 'skrz' adptype = 'Voc' at = 'V' else: pform = m.group(1) plemma = m.group(1) adptype = 'Prep' at = 'R' # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. if node.feats['Gender'] == 'Neut': gender = 'Neut' animacy = '' g = 'N' elif node.feats['Animacy'] == 'Inan': gender = 'Masc' animacy = 'Animacy=Inan|' g = 'I' else: gender = 'Masc' animacy = 'Animacy=Anim|' g = 'M' if m.group(2).lower() == 'ž': return { 'form': pform + ' nějž', 'lemma': plemma + ' jenž', 'upos': 'ADP PRON', 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } else: return { 'form': pform + ' něj', 'lemma': plemma + ' on', 'upos': 'ADP PRON', 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } return None
[docs] def postprocess_mwt(self, mwt): if mwt.words[0].deprel == 'fixed' and mwt.words[0].parent.parent.upos == 'VERB': mwt.words[1].parent = mwt.words[0].parent.parent