Source code for udapi.block.read.conll

""""Conll is a reader block for CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009)."""
import json
import logging
import re

import udapi.block.read.conllu
from udapi.core.root import Root
from udapi.core.node import Node


class Conll(udapi.block.read.conllu.Conllu):
    """A reader of CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009, ...)."""

    def __init__(self, separator='tab',
                 attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc',
                 **kwargs):
        """Create the Conll reader object.

        This is a subclass of udapi.block.read.conllu.Conllu,
        which adds support for arbitrary column names, thus supporting not only CoNLL-U,
        but also CoNLL-X, CoNLL-2009 and many other CoNLL-like formats.

        Args:
        separator: How are the columns separated?
            Default='tab' is the only possibility in valid CoNLL-U files.
            'space' means one or more whitespaces (this does not allow forms with space).
            'doublespace' means two or more spaces.
        attributes: comma-separated list of column names in the input files
            (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc').
            Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U).
            For ignoring a column, use "_" as its name.
            Column "ord" marks the column with the 1-based word-order number/index
            (usually called ID).
            Column "head" marks the column with the dependency parent index (word-order number).

            For example, for CoNLL-X, which uses the name1=value1|name2=value2 format of FEATS, use
            `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_`,
            but note that the attributes upos, feats and deprel will contain language-specific
            values, not valid according to the UD guidelines, so a further conversion will be
            needed. You will lose the projective_HEAD and projective_DEPREL attributes.

            For CoNLL-2009 you can use
            `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`.
            You will lose the predicted_* attributes and the semantic/predicate annotation.

        TODO: allow storing the rest of the columns in misc, e.g. `node.misc[feats]`
            for feats which do not use the name1=value1|name2=value2 format.
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        self.separator = separator

    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in a slowdown.
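    # Usage sketch (added for illustration, not part of the original module): the reader is
    # typically combined with other blocks in a udapy scenario; the attribute strings below
    # just repeat the examples from the docstring above, and the file names are hypothetical.
    #
    #   udapy read.Conll attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_ \
    #         write.Conllu < input.conllx > output.conllu
    #   udapy read.Conll attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel \
    #         write.Conllu < input.conll2009 > output.conllu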
    def parse_node_line(self, line, root, nodes, parents, mwts):
        if self.separator == 'tab':
            fields = line.split('\t')
        elif self.separator == 'space':
            fields = line.split()
        elif self.separator == 'doublespace':
            fields = re.split('  +', line)
        else:
            raise ValueError('separator=%s is not valid' % self.separator)
        if len(fields) != len(self.node_attributes):
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))

        # Multi-word tokens will be processed later.
        if '-' in fields[0]:
            mwts.append(fields)
            return
        if '.' in fields[0]:
            empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                            xpos=fields[4], feats=fields[5], misc=fields[9])
            empty.ord = fields[0]
            empty.raw_deps = fields[8]  # TODO
            return

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns.
        node = root.create_child()
        for (n_attribute, attribute_name) in enumerate(self.node_attributes):
            if attribute_name == 'head':
                try:
                    parents.append(int(fields[n_attribute]))
                except ValueError as exception:
                    if not self.strict and fields[n_attribute] == '_':
                        if self.empty_parent == 'warn':
                            logging.warning("Empty parent/head index in '%s'", line)
                        parents.append(0)
                    else:
                        raise exception
            elif attribute_name == 'ord':
                setattr(node, 'ord', int(fields[n_attribute]))
            elif attribute_name == 'deps':
                setattr(node, 'raw_deps', fields[n_attribute])
            elif attribute_name != '_' and fields[n_attribute] != '_':
                setattr(node, attribute_name, fields[n_attribute])

        nodes.append(node)
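    # Worked example (added for illustration, not part of the original code): with the default
    # CoNLL-U attributes, a tab-separated node line such as
    #   1  Dogs  dog  NOUN  NNS  Number=Plur  2  nsubj  2:nsubj  _
    # is split into ten fields; field 0 sets node.ord, field 6 is appended to parents as
    # int(head), field 8 is stored in node.raw_deps, and the remaining fields that are not '_'
    # are assigned via setattr (form, lemma, upos, xpos, feats, misc).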
    # Acknowledged code duplication with read.Conllu
    def read_tree_from_lines(self, lines):
        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in lines:
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes, parents, mwts)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id,
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
            nodes.pop()
            root._children = []
            root._descendants = []

        # Set dependency parents (now, all nodes of the tree are created).
        for node_ord, node in enumerate(nodes[1:], 1):
            try:
                parent = nodes[parents[node_ord]]
            except IndexError:
                raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
            if node is parent:
                if self.fix_cycles:
                    logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node)
                    node._parent = root
                    root._children.append(node)
                else:
                    raise ValueError(f"Detected a cycle: {node} attached to itself")
            else:
                if node.children:
                    climbing = parent._parent
                    while climbing:
                        if climbing is node:
                            if self.fix_cycles:
                                logging.warning(
                                    "Ignoring a cycle (attaching to the root instead):\n%s", parent)
                                parent = root
                                break
                            else:
                                raise ValueError(f"Detected a cycle: {node}")
                        climbing = climbing._parent
                node._parent = parent
                parent._children.append(node)

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])
        return root
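# A minimal usage sketch (added for illustration, not part of the original module). It assumes
# the reader can be instantiated without an input file and driven directly through
# read_tree_from_lines; the attribute string and the data lines are made up for the example.
#
#   reader = Conll(attributes='ord,form,lemma,upos,xpos,feats,head,deprel,_,_')
#   lines = ['1\tDogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_',
#            '2\tbark\tbark\tVERB\t_\t_\t0\troot\t_\t_']
#   tree = reader.read_tree_from_lines(lines)
#   print([node.form for node in tree.descendants])   # expected: ['Dogs', 'bark']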