"""Wrapper for UDPipe online web service."""
import io
import sys
import email.mime.multipart
import email.mime.nonmultipart
import email.policy
import json
import os
import sys
import urllib.error
import urllib.request
from udapi.block.read.conllu import Conllu as ConlluReader
from udapi.core.root import Root
class UDPipeOnline:
    """Wrapper for UDPipe online web service.

    The processing methods delegate the HTTP call to
    ``self.perform_request(params=...)``.
    NOTE(review): ``perform_request`` is not defined in this part of the file;
    the code below assumes it returns the service output as a string.
    """

    def __init__(self, model, server="https://lindat.mff.cuni.cz/services/udpipe/api"):
        """Create the UDPipeOnline tool object.

        Args:
            model: model name sent as the "model" parameter of every request.
            server: base URL of the web service; endpoint paths such as
                "/models" are appended to it verbatim.
        """
        self.model = model
        self.server = server

    def list_models(self):
        """Return the names of the models listed by the server's "/models" endpoint."""
        with urllib.request.urlopen(self.server + "/models") as response:
            payload = json.loads(response.read())
        return list(payload["models"])

    @staticmethod
    def _horizontal_input(nodes):
        """Join the forms of `nodes` into one line of UDPipe "horizontal" input."""
        # Tokens are separated by plain spaces in the horizontal format, so a space
        # inside a form would be read as a token boundary and shift the alignment of
        # all following nodes. UDPipe treats NO-BREAK SPACE (U+00A0) as part of the token.
        return " ".join(node.form.replace(" ", "\u00a0") for node in nodes)

    @staticmethod
    def _reader_for(conllu_text):
        """Return a CoNLL-U reader whose input is the string `conllu_text`."""
        reader = ConlluReader(empty_parent="ignore")
        reader.files.filehandle = io.StringIO(conllu_text)
        return reader

    def tag_parse_tree(self, root, tag=True, parse=True):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).

        The nodes of `root` are updated in place; nothing is returned.
        A tree without nodes is left untouched and no request is sent.

        Args:
            root: root of the tree to be processed.
            tag: copy upos, xpos, lemma and feats from the service output.
            parse: copy deprel and re-attach the nodes according to the service output.

        Raises:
            ValueError: if both `tag` and `parse` are False.
        """
        if not tag and not parse:
            raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.')
        descendants = root.descendants
        if not descendants:
            return

        # The tagger is requested even when tag=False; in that case its output
        # is simply not copied to the nodes.
        params = {
            "model": self.model,
            "data": self._horizontal_input(descendants),
            "input": "horizontal",
            "tagger": "",
        }
        attrs = 'upos xpos lemma feats'.split() if tag else []
        if parse:
            params["parser"] = ""
            attrs.append('deprel')

        out_data = self.perform_request(params=params)
        parsed_root = self._reader_for(out_data).read_tree()

        if parse:
            # Flatten the tree before the new parents are assigned one by one.
            root.flatten()
        for parsed_node in parsed_root.descendants:
            # Nodes are aligned by position: `ord` is 1-based.
            node = descendants[parsed_node.ord - 1]
            if parse:
                parent_ord = parsed_node.parent.ord
                # ord 0 means the parent is the technical root.
                node.parent = descendants[parent_ord - 1] if parent_ord else root
            for attr in attrs:
                setattr(node, attr, getattr(parsed_node, attr))

    def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        If resegment=True, the returned list of Udapi trees may contain multiple trees.

        Returns:
            A list of trees whose first item is always the `root` object itself.
            If the service output contains no tree, `[root]` is returned
            and `root` is left unchanged.

        Raises:
            ValueError: if parse=True is combined with tag=False,
                or if `root` already has children.
        """
        if parse and not tag:
            raise ValueError('Combination parse=True tag=False is not allowed.')
        if root.children:
            raise ValueError('Tree already contained nodes before tokenization')

        # Tokenize and possibly segment the input text
        params = {
            "model": self.model,
            "data": root.text,
            "tokenizer": "" if resegment else "presegmented",
        }
        if tag:
            params["tagger"] = ""
        if parse:
            params["parser"] = ""
        out_data = self.perform_request(params=params)
        trees = self._reader_for(out_data).read_trees()
        if not trees:
            # Nothing came back (e.g. empty text): there is no tree to graft onto root.
            return [root]

        # The input "root" object must be the first item in "trees",
        # so the content of the first parsed tree is moved under it.
        for attr in ('_children', '_descendants', '_mwts', 'text', 'comment'):
            setattr(root, attr, getattr(trees[0], attr))
        for node in root._children:
            node._parent = root
        for node in root._descendants:
            node._root = root
        trees[0] = root
        return trees

    def segment_text(self, text):
        """Segment the provided text into sentences returned as a Python list.

        An empty list is returned when the service output is empty
        (or contains only whitespace).
        """
        params = {
            "model": self.model,
            "data": text,
            "tokenizer": "",
            "output": "plaintext=normalized_spaces",
        }
        output = self.perform_request(params=params).rstrip()
        # "".split("\n") would give [""], i.e. one bogus empty sentence.
        return output.split("\n") if output else []