Source code for udapi.block.udpipe.base

"""Block udpipe.Base for tagging and parsing using UDPipe."""
from udapi.core.block import Block
from udapi.tool.udpipe import UDPipe
from udapi.tool.udpipeonline import UDPipeOnline
from udapi.core.bundle import Bundle

# Maps a model alias to the path of a UDPipe model file (all of them UD 2.4
# models).  An alias is either a bare language code (e.g. 'cs') or
# '<language code>_<treebank>' (e.g. 'cs_cac').  Where both forms exist for
# the same model, they point to the same file.
KNOWN_MODELS = {
    'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe',
    'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe',
    'grc': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe',
    'grc_perseus': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe',
    'grc_proiel': 'models/udpipe/2.4/ancient_greek-proiel-ud-2.4-190531.udpipe',
    'ar': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe',
    'ar_padt': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe',
    'hy': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe',
    'hy_armtdp': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe',
    'eu': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe',
    'eu_bdt': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe',
    'be': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe',
    'be_hse': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe',
    'bg': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe',
    'bg_btb': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe',
    'ca': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe',
    'ca_ancora': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe',
    'zh': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe',
    'zh_gsd': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe',
    'lzh': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe',
    'lzh_kyoto': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe',
    'cop': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
    'cop_scriptorium': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
    # Misspelled alias kept so that existing scenarios using it keep working.
    'cop_scriptotium': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
    'hr': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe',
    'hr_set': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe',
    'cs': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe',
    'cs_pdt': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe',
    'cs_cac': 'models/udpipe/2.4/czech-cac-ud-2.4-190531.udpipe',
    'cs_cltt': 'models/udpipe/2.4/czech-cltt-ud-2.4-190531.udpipe',
    'cs_fictree': 'models/udpipe/2.4/czech-fictree-ud-2.4-190531.udpipe',
    'da': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe',
    'da_ddt': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe',
    'nl': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe',
    'nl_alpino': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe',
    'nl_lassysmall': 'models/udpipe/2.4/dutch-lassysmall-ud-2.4-190531.udpipe',
    'en': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe',
    'en_ewt': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe',
    'en_gum': 'models/udpipe/2.4/english-gum-ud-2.4-190531.udpipe',
    'en_lines': 'models/udpipe/2.4/english-lines-ud-2.4-190531.udpipe',
    'en_partut': 'models/udpipe/2.4/english-partut-ud-2.4-190531.udpipe',
    'et_edt': 'models/udpipe/2.4/estonian-edt-ud-2.4-190531.udpipe',
    'et_ewt': 'models/udpipe/2.4/estonian-ewt-ud-2.4-190531.udpipe',
    'fi': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe',
    'fi_tdt': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe',
    'fi_ftb': 'models/udpipe/2.4/finnish-ftb-ud-2.4-190531.udpipe',
    'fr_gsd': 'models/udpipe/2.4/french-gsd-ud-2.4-190531.udpipe',
    'fr_partut': 'models/udpipe/2.4/french-partut-ud-2.4-190531.udpipe',
    'fr_sequoia': 'models/udpipe/2.4/french-sequoia-ud-2.4-190531.udpipe',
    'fr_spoken': 'models/udpipe/2.4/french-spoken-ud-2.4-190531.udpipe',
    'gl_ctg': 'models/udpipe/2.4/galician-ctg-ud-2.4-190531.udpipe',
    'gl_treegal': 'models/udpipe/2.4/galician-treegal-ud-2.4-190531.udpipe',
    'de': 'models/udpipe/2.4/german-gsd-ud-2.4-190531.udpipe',
    'got': 'models/udpipe/2.4/gothic-proiel-ud-2.4-190531.udpipe',
    'el': 'models/udpipe/2.4/greek-gdt-ud-2.4-190531.udpipe',
    'he': 'models/udpipe/2.4/hebrew-htb-ud-2.4-190531.udpipe',
    'hi': 'models/udpipe/2.4/hindi-hdtb-ud-2.4-190531.udpipe',
    'hu': 'models/udpipe/2.4/hungarian-szeged-ud-2.4-190531.udpipe',
    'id': 'models/udpipe/2.4/indonesian-gsd-ud-2.4-190531.udpipe',
    'ga': 'models/udpipe/2.4/irish-idt-ud-2.4-190531.udpipe',
    'it_isdt': 'models/udpipe/2.4/italian-isdt-ud-2.4-190531.udpipe',
    'it_partut': 'models/udpipe/2.4/italian-partut-ud-2.4-190531.udpipe',
    'it_postwita': 'models/udpipe/2.4/italian-postwita-ud-2.4-190531.udpipe',
    'it_vit': 'models/udpipe/2.4/italian-vit-ud-2.4-190531.udpipe',
    'ja': 'models/udpipe/2.4/japanese-gsd-ud-2.4-190531.udpipe',
    'ko_gsd': 'models/udpipe/2.4/korean-gsd-ud-2.4-190531.udpipe',
    'ko_kaist': 'models/udpipe/2.4/korean-kaist-ud-2.4-190531.udpipe',
    'la_ittb': 'models/udpipe/2.4/latin-ittb-ud-2.4-190531.udpipe',
    'la_perseus': 'models/udpipe/2.4/latin-perseus-ud-2.4-190531.udpipe',
    'la_proiel': 'models/udpipe/2.4/latin-proiel-ud-2.4-190531.udpipe',
    'lv': 'models/udpipe/2.4/latvian-lvtb-ud-2.4-190531.udpipe',
    'lt_alksnis': 'models/udpipe/2.4/lithuanian-alksnis-ud-2.4-190531.udpipe',
    'lt_hse': 'models/udpipe/2.4/lithuanian-hse-ud-2.4-190531.udpipe',
    'mt': 'models/udpipe/2.4/maltese-mudt-ud-2.4-190531.udpipe',
    'mr': 'models/udpipe/2.4/marathi-ufal-ud-2.4-190531.udpipe',
    'sme': 'models/udpipe/2.4/north_sami-giella-ud-2.4-190531.udpipe',
    'no_bokmaal': 'models/udpipe/2.4/norwegian-bokmaal-ud-2.4-190531.udpipe',
    'no_nynorsklia': 'models/udpipe/2.4/norwegian-nynorsklia-ud-2.4-190531.udpipe',
    'no_nynorsk': 'models/udpipe/2.4/norwegian-nynorsk-ud-2.4-190531.udpipe',
    'cu': 'models/udpipe/2.4/old_church_slavonic-proiel-ud-2.4-190531.udpipe',
    'fro': 'models/udpipe/2.4/old_french-srcmf-ud-2.4-190531.udpipe',
    'orv': 'models/udpipe/2.4/old_russian-torot-ud-2.4-190531.udpipe',
    'fa': 'models/udpipe/2.4/persian-seraji-ud-2.4-190531.udpipe',
    'pl_lfg': 'models/udpipe/2.4/polish-lfg-ud-2.4-190531.udpipe',
    'pl_pdb': 'models/udpipe/2.4/polish-pdb-ud-2.4-190531.udpipe',
    'pt_bosque': 'models/udpipe/2.4/portuguese-bosque-ud-2.4-190531.udpipe',
    'pt_gsd': 'models/udpipe/2.4/portuguese-gsd-ud-2.4-190531.udpipe',
    'ro_nonstandard': 'models/udpipe/2.4/romanian-nonstandard-ud-2.4-190531.udpipe',
    'ro_rrt': 'models/udpipe/2.4/romanian-rrt-ud-2.4-190531.udpipe',
    'ru_gsd': 'models/udpipe/2.4/russian-gsd-ud-2.4-190531.udpipe',
    'ru_syntagrus': 'models/udpipe/2.4/russian-syntagrus-ud-2.4-190531.udpipe',
    'ru_taiga': 'models/udpipe/2.4/russian-taiga-ud-2.4-190531.udpipe',
    'sr': 'models/udpipe/2.4/serbian-set-ud-2.4-190531.udpipe',
    'sk': 'models/udpipe/2.4/slovak-snk-ud-2.4-190531.udpipe',
    'sl_ssj': 'models/udpipe/2.4/slovenian-ssj-ud-2.4-190531.udpipe',
    'sl_sst': 'models/udpipe/2.4/slovenian-sst-ud-2.4-190531.udpipe',
    'es_ancora': 'models/udpipe/2.4/spanish-ancora-ud-2.4-190531.udpipe',
    'es_gsd': 'models/udpipe/2.4/spanish-gsd-ud-2.4-190531.udpipe',
    'sv_lines': 'models/udpipe/2.4/swedish-lines-ud-2.4-190531.udpipe',
    'sv_talbanken': 'models/udpipe/2.4/swedish-talbanken-ud-2.4-190531.udpipe',
    'ta': 'models/udpipe/2.4/tamil-ttb-ud-2.4-190531.udpipe',
    'te': 'models/udpipe/2.4/telugu-mtg-ud-2.4-190531.udpipe',
    'tr': 'models/udpipe/2.4/turkish-imst-ud-2.4-190531.udpipe',
    'uk': 'models/udpipe/2.4/ukrainian-iu-ud-2.4-190531.udpipe',
    'ur': 'models/udpipe/2.4/urdu-udtb-ud-2.4-190531.udpipe',
    'ug': 'models/udpipe/2.4/uyghur-udt-ud-2.4-190531.udpipe',
    'vi': 'models/udpipe/2.4/vietnamese-vtb-ud-2.4-190531.udpipe',
    'wo': 'models/udpipe/2.4/wolof-wtb-ud-2.4-190531.udpipe',
}


class Base(Block):
    """Base class for all UDPipe blocks.

    Parameters:
    model: model identifier passed to the tool as ``model=``; when not set,
        it is derived from ``model_alias``.
    model_alias: a key of ``KNOWN_MODELS`` (e.g. ``en``). With ``online=True``
        the alias itself is passed to the tool as the model name.
    online: use ``UDPipeOnline`` instead of the local ``UDPipe`` wrapper.
    tokenize, tag, parse: which steps should be executed.
    resegment: let the tool split a tree into several sentences; each extra
        sentence is stored in a newly created bundle.
    ranges: forwarded to the tool as its ``ranges`` argument.
    delete_nodes: remove all existing nodes of a tree before processing it.
    """

    # pylint: disable=too-many-arguments
    def __init__(self, model=None, model_alias=None, online=False,
                 tokenize=True, tag=True, parse=True, resegment=False,
                 ranges=False, delete_nodes=False, **kwargs):
        """Store the settings; the tool itself is created lazily (see `tool`)."""
        super().__init__(**kwargs)
        self.model, self.model_alias, self.online = model, model_alias, online
        self._tool = None
        self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment
        self.ranges, self.delete_nodes = ranges, delete_nodes

    @property
    def tool(self):
        """Return the tool (UDPipe in this case), created lazily.

        Raises:
            ValueError: if neither `model` nor `model_alias` is set.
            KeyError: if the offline tool is requested and `model_alias`
                is not a key of `KNOWN_MODELS`.
        """
        if self._tool:
            return self._tool
        if not self.model:
            if not self.model_alias:
                raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!')
            if self.online:
                # The online tool receives the alias itself as the model name.
                self.model = self.model_alias
            else:
                self.model = KNOWN_MODELS[self.model_alias]
        if self.online:
            self._tool = UDPipeOnline(model=self.model)
        else:
            self._tool = UDPipe(model=self.model)
        return self._tool

    def process_document(self, doc):
        """Run the selected UDPipe steps on `doc` and rebuild `doc.bundles`.

        When resegmentation yields more than one sentence for a tree, the
        original bundle gets the suffix ``-1`` appended to its ``bundle_id``
        and every further sentence is stored in a new bundle with the
        suffix ``-2``, ``-3``, ... placed right after it.

        Raises:
            ValueError: for a combination of tokenize/tag/parse/resegment
                that is not implemented.
        """
        tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges

        # Fast path: hand the whole document over to the online tool.
        # NOTE(review): `self.zones` is not set in this class; presumably it
        # is provided by `Block` -- confirm.
        if self.zones == "all" and self.online:
            self.tool.process_document(doc, tok, tag, par, reseg, ranges)
            return

        new_bundles = []
        for bundle in doc.bundles:
            # Keep every original bundle exactly once. Appending it once per
            # tree (as done previously) duplicated bundles holding several
            # trees and silently dropped bundles holding no tree.
            new_bundles.append(bundle)
            for tree in bundle:
                if not self._should_process_tree(tree):
                    continue
                if self.delete_nodes:
                    for subroot in tree.children:
                        subroot.remove()
                if tok:
                    new_trees = self.tool.tokenize_tag_parse_tree(
                        tree, resegment=reseg, tag=tag, parse=par, ranges=ranges)
                    if self.resegment and len(new_trees) > 1:
                        # NOTE(review): if several trees of one bundle are
                        # resegmented, the '-1' suffix is appended repeatedly.
                        orig_bundle_id = bundle.bundle_id
                        bundle.bundle_id = orig_bundle_id + '-1'
                        # The first sentence is skipped here; only the further
                        # ones get a new bundle.
                        for i, new_tree in enumerate(new_trees[1:], 2):
                            new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}")
                            new_tree.zone = tree.zone
                            new_bundle.add_tree(new_tree)
                            new_bundles.append(new_bundle)
                elif not tok and not reseg and (tag or par):
                    self.tool.tag_parse_tree(tree, tag=tag, parse=par)
                elif not tok and reseg and not tag and not par:
                    # Segmentation only: split the text, create no nodes.
                    sentences = self.tool.segment_text(tree.text)
                    if len(sentences) > 1:
                        orig_bundle_id = bundle.bundle_id
                        bundle.bundle_id = orig_bundle_id + '-1'
                        tree.text = sentences[0]
                        for i, sentence in enumerate(sentences[1:], 2):
                            new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}")
                            new_tree = new_bundle.create_tree(zone=tree.zone)
                            new_tree.text = sentence
                            new_bundles.append(new_bundle)
                else:
                    raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}")
        doc.bundles = new_bundles
''' Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD =head1 SYNOPSIS # from the command line echo John loves Mary | udapi.pl Read::Sentences UDPipe::Base model_alias=en Write::TextModeTrees # in scenario UDPipe::Base model=/home/me/english-ud-1.2-160523.udpipe UDPipe::Base model_alias=en UDPipe::EN # shortcut for the above UDPipe::EN tokenize=1 tag=1 parse=0 =head1 DESCRIPTION This block loads L<Udapi::Tool::UDPipe> (a wrapper for the UDPipe C++ tool) with the given C<model> for analysis into the Universal Dependencies (UD) style. UDPipe can do tokenization, tagging (plus lemmatization and universal features) and parsing (with deprel labels) and users of this block can select which of the subtasks should be done using parameters C<tokenize>, C<tag> and C<parse>. The default is to do all three. =head1 TODO UDPipe can also do sentence segmentation, but L<Udapi::Tool::UDPipe> does not support it yet. Similarly with multi-word tokens. =head1 PARAMETERS =head2 C<model> Path to the model file within Udapi share (or relative path starting with "./" or absolute path starting with "/"). This parameter is required if C<model_alias> is not supplied. =head2 C<model_alias> The C<model> parameter can be omitted if this parameter is supplied. Currently available model aliases are: B<grc_proiel, grc, ar, eu, bg, hr, cs, da, nl, en, et, fi, fi_ftb, fr, got, de, el, he, hi, hu, id, ga, it, la_itt, la_proiel, la, no, cu, fa, po, ro, pt, sl, es, ta, sv>. They correspond to paths where the language code in the alias is substituted with the respective language name, e.g. B<grc_proiel> expands to C<models/udpipe/ancient-greek-ud-1.2-160523.udpipe>. =head1 tokenize Do tokenization, i.e. create new nodes with attributes C<form>, C<misc> (if SpaceAfter=No) and C<ord>. The sentence string is taken from the root's attribute C<text>. =head1 tag Fill node attributes: C<lemma>, C<upos>, C<xpos> and C<feats>. On the input, just the attribute C<form> is expected. 
=head1 parse Fill node attributes: C<deprel> and rehang the nodes to their parent. On the input, attributes C<lemma>, C<upos>, C<xpos> and C<feats> are expected. =head1 SEE ALSO L<http://ufal.mff.cuni.cz/udpipe> L<Udapi::Tool::UDPipe> '''