Source code for udapi.block.ud.cs.addmwt

"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens."""
import udapi.block.ud.addmwt

MWTS = {
    'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'},
    'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'},
    'abys': {'form': 'aby bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
    'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
    'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'},
    'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'},
    'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
    'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
    'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
}
for v in MWTS.values():
    v['upos'] = 'SCONJ AUX'
    number = '-'
    if 'Sing' in v['feats']:
        number = 'S'
    elif 'Plur' in v['feats']:
        number = 'P'
    person = '-'
    if 'Person=1' in v['feats']:
        person = '1'
    elif 'Person=2' in v['feats']:
        person = '2'

    v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person)
    v['deprel'] = '* aux'
    v['lemma'] = v['form'].split()[0] + ' být'
    v['main'] = 0
    v['shape'] = 'siblings'

# nač -> na + co
for prep in 'na za o'.split():
    MWTS[prep + 'č'] = {
        'form': prep + ' co',
        'lemma': prep + ' co',
        'upos': 'ADP PRON',
        'deprel': 'case *',
        'main': 1,
        'shape': 'subtree',
    }


[docs] class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree)."""
[docs] def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" analysis = MWTS.get(node.form.lower(), None) if analysis is not None: return analysis # There is no VerbType=verbconj in the UD_Czech data. # The purpose of this rule is rather to show that # it is possible to write such "dynamic" rules # (which cannot be included in static MWTS). if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': return { 'form': node.form.lower()[:-1] + ' neboť', 'lemma': '* neboť', 'upos': '* CCONJ', 'xpos': 'Vt-S---3P-NA--2 J^-------------', 'feats': '* _', 'deprel': '* cc', 'main': 0, 'shape': 'subtree', } return None
[docs] def postprocess_mwt(self, mwt): if mwt.words[0].deprel == 'fixed' and mwt.words[0].parent.parent.upos == 'VERB': mwt.words[1].parent = mwt.words[0].parent.parent