Source code for udapi.block.ud.settranslation

Block SetTranslation sets the sentence-level translation (the attribute
text_en, i.e. the English translation) from a separate text file with one
sentence per line; the n-th line of the file is used for the n-th sentence.
For example, one can export the original sentences using write.SentencesHtml,
Google-translate them in the web browser, copy and paste (CTRL+C, CTRL+V) the
result into a plain-text editor, save it as translations.txt and import it
using this block.

udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu

Author: Dan Zeman
from udapi.core.block import Block
import re
import logging

class SetTranslation(Block):
    """Set text_en to the next available translation.

    Translations are read from a plain-text file (one sentence per line) when
    the block is created and are handed out in order, one line per processed
    tree.
    """

    # Matches a sentence-level comment line that already holds a translation.
    # The initial '#' character of the comment has been stripped.
    _TEXT_EN_RE = re.compile(r'\s*text_en\s*=')

    def __init__(self, file, overwrite=False, **kwargs):
        """Create the SetTranslation block.

        Parameters:
        file: the name of the text file with the translations
            (one sentence per line, UTF-8 encoded)
        overwrite=1: set the translation even if the sentence already has one
            (default: do not overwrite existing translations)

        Raises:
        OSError: if the file cannot be opened for reading.
        """
        super().__init__(**kwargs)
        self.file = file
        # Read everything up front and close the file immediately; the handle
        # is not needed once the lines are in memory.
        # readlines() keeps each line's terminator.
        with open(self.file, 'r', encoding='utf-8') as fh:
            self.trlines = fh.readlines()
        self.nlines = len(self.trlines)
        self.iline = 0  # index of the next translation line to hand out
        self.overwrite = overwrite

    def process_tree(self, tree):
        """Attach the next translation line to `tree` as a text_en comment.

        If the tree's comment already contains a text_en line, it is replaced
        only when `overwrite` is set; otherwise a new text_en line is appended.
        The translation line is consumed even when an existing translation is
        kept, so lines and sentences stay aligned.

        Once the translations are exhausted, the tree is left untouched and a
        warning is logged.
        """
        if self.iline >= self.nlines:
            # NOTE(review): iline is never advanced past nlines, so this
            # warning is emitted for every surplus sentence — confirm whether
            # a single warning was intended.
            if self.iline == self.nlines:
                logging.warning(
                    'There are only %d translation lines but there are more input sentences.',
                    self.nlines)
            return

        translation = self.trlines[self.iline]
        self.iline += 1
        text_en_line = ' text_en = ' + translation
        # NOTE(review): `translation` still carries its line terminator, so
        # overwriting a text_en line that is not the last comment line leaves
        # an empty line in the joined comment — confirm whether the terminator
        # should be stripped.

        comments = tree.comment.split('\n') if tree.comment else []
        for i, comment_line in enumerate(comments):
            if self._TEXT_EN_RE.match(comment_line):
                # Only the first existing text_en line is considered.
                if self.overwrite:
                    comments[i] = text_en_line
                break
        else:
            # No existing translation found: add a new comment line.
            comments.append(text_en_line)
        tree.comment = '\n'.join(comments)