Source code for udapi.block.write.conllu

"""Conllu class is a a writer of files in the CoNLL-U format."""
import json
from udapi.core.basewriter import BaseWriter

class Conllu(BaseWriter):
    """A writer of files in the CoNLL-U format."""

    def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, **kwargs):
        """Create the writer.

        Args:
            print_sent_id: if true, print the ``# sent_id`` comment; the same flag
                also gates the ``# newdoc``, ``# global.Entity`` and ``# newpar`` comments.
            print_text: if true, print the ``# text`` comment.
            print_empty_trees: if true, a tree without any nodes is printed with
                an artificial node; if false, such a tree is skipped entirely.
            **kwargs: passed unchanged to ``BaseWriter.__init__``.
        """
        super().__init__(**kwargs)
        self.print_sent_id = print_sent_id
        self.print_text = print_text
        self.print_empty_trees = print_empty_trees

    @staticmethod
    def _with_optional_id(keyword, value):
        """Return `keyword`, followed by ``' id = ' + value`` unless `value` is True."""
        return keyword + (' id = ' + value if value is not True else '')

    @staticmethod
    def _text_value(tree):
        """Return the sentence text to print: computed if unset, else without line breaks."""
        if tree.text is None:
            return tree.compute_text()
        return tree.text.replace('\n', '').replace('\r', '').rstrip()

    @staticmethod
    def _print_comments_upto(comment_lines, printed_i, last_i):
        """Print not-yet-printed comment lines up to index `last_i` (inclusive).

        Lines that are None or empty are skipped.
        Returns the index of the last line consumed, i.e. the new `printed_i`.
        """
        while printed_i < last_i:
            printed_i += 1
            if comment_lines[printed_i]:
                print('#' + comment_lines[printed_i])
        return printed_i

    def process_tree(self, tree):  # pylint: disable=too-many-branches
        """Print one tree: its comment lines, its nodes and a terminating empty line.

        Args:
            tree: the tree to print. This method reads its ``empty_nodes``,
                ``_descendants``, ``comment``, ``sent_id``, ``text``, ``newdoc``,
                ``newpar``, ``json`` and ``document.meta``.
        """
        empty_nodes = tree.empty_nodes
        if empty_nodes:
            nodes = sorted(tree._descendants + empty_nodes)
        else:
            nodes = tree._descendants

        # Empty sentences are not allowed in CoNLL-U, so with print_empty_trees==0
        # we need to skip the whole tree (including possible comments).
        if not nodes and not self.print_empty_trees:
            return

        # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual
        # value of the attribute and make note on which line (i_*) they were present.
        # A placeholder whose comment should not be printed is replaced with None.
        comment_lines = tree.comment.splitlines()
        i_newdoc = i_newpar = i_sent_id = i_text = i_global_entity = -1
        for i, c_line in enumerate(comment_lines):
            if c_line == '$SENT_ID':
                i_sent_id = i
                comment_lines[i] = ' sent_id = ' + tree.sent_id if self.print_sent_id else None
            elif c_line == '$TEXT':
                i_text = i
                # With print_text off, the placeholder must be dropped like the others;
                # otherwise it would be printed verbatim as "#$TEXT".
                if self.print_text:
                    comment_lines[i] = ' text = ' + self._text_value(tree)
                else:
                    comment_lines[i] = None
            elif c_line == '$NEWDOC':
                i_newdoc = i
                if self.print_sent_id and tree.newdoc:
                    comment_lines[i] = self._with_optional_id(' newdoc', tree.newdoc)
                else:
                    comment_lines[i] = None
            elif c_line == '$NEWPAR':
                i_newpar = i
                if self.print_sent_id and tree.newpar:
                    comment_lines[i] = self._with_optional_id(' newpar', tree.newpar)
                else:
                    comment_lines[i] = None
            elif c_line == '$GLOBAL.ENTITY':
                i_global_entity = i
                ge = tree.document.meta.get('global.Entity')
                if ge:
                    comment_lines[i] = ' global.Entity = ' + ge
                else:
                    comment_lines[i] = None

        # Now print the special comments: global.columns, newdoc, newpar, sent_id and text.
        # If these comments were already present in tree.comment (as marked with the
        # placeholders), keep them at their original position and print also all comment
        # lines preceding them. If they were missing, try to print them at the correct position.
        printed_i = -1
        # The first line may be None (a suppressed placeholder), so test it before startswith.
        if comment_lines and comment_lines[0] and comment_lines[0].startswith(' global.columns'):
            printed_i += 1
            print('#' + comment_lines[printed_i])
        if self.print_sent_id:
            if tree.newdoc:
                if i_newdoc == -1:
                    print(self._with_optional_id('# newdoc', tree.newdoc))
                else:
                    printed_i = self._print_comments_upto(comment_lines, printed_i, i_newdoc)
                # global.Entity is printed only for a tree which starts a new document.
                ge = tree.document.meta.get('global.Entity')
                if ge:
                    if i_global_entity == -1:
                        print('# global.Entity = ' + ge)
                    else:
                        printed_i = self._print_comments_upto(
                            comment_lines, printed_i, i_global_entity)
            if tree.newpar:
                if i_newpar == -1:
                    print(self._with_optional_id('# newpar', tree.newpar))
                else:
                    printed_i = self._print_comments_upto(comment_lines, printed_i, i_newpar)
            if i_sent_id == -1:
                print('# sent_id = ' + tree.sent_id)
            else:
                printed_i = self._print_comments_upto(comment_lines, printed_i, i_sent_id)
        if self.print_text and i_text == -1:
            print('# text = ' + self._text_value(tree))

        # All remaining comment lines (including a substituted $TEXT line, if any).
        for c_line in comment_lines[printed_i + 1:]:
            if c_line:
                print('#' + c_line)

        # Special-purpose json_* comments should always be at the end of the comment block.
        if tree.json:
            for key, value in sorted(tree.json.items()):
                print(f"# json_{key} = {json.dumps(value, ensure_ascii=False, sort_keys=True)}")

        last_mwt_id = 0
        for node in nodes:
            # Print the multi-word token line once, before the first word it covers.
            mwt = node._mwt
            if mwt and node._ord > last_mwt_id:
                print('\t'.join((mwt.ord_range,
                                 '_' if mwt.form is None else mwt.form,
                                 '_\t_\t_\t_\t_\t_\t_',
                                 '_' if mwt._misc is None else str(mwt.misc))))
                last_mwt_id = mwt.words[-1]._ord

            if node._parent is None:
                head = '_'  # Empty nodes
            else:
                try:
                    head = str(node._parent._ord)
                except AttributeError:
                    # The parent has no _ord attribute; print the head as 0.
                    head = '0'

            print('\t'.join('_' if v is None else v for v in
                            (str(node._ord), node.form, node.lemma, node.upos, node.xpos,
                             '_' if node._feats is None else str(node.feats),
                             head, node.deprel, node.raw_deps,
                             '_' if node._misc is None else str(node.misc))))

        # Empty sentences are not allowed in CoNLL-U,
        # but with print_empty_trees==1 (which is the default),
        # we will print an artificial node, so we can print the comments.
        if not nodes:
            print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes")

        # Empty line separates trees in CoNLL-U (and is required after the last tree as well)
        print("")

    def before_process_document(self, document):
        """Print doc_json_* headers."""
        super().before_process_document(document)
        if document.json:
            for key, value in sorted(document.json.items()):
                dumped = json.dumps(value, ensure_ascii=False, sort_keys=True)
                print(f"# doc_json_{key} = {dumped}")