"""Conll is a reader block for CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009)."""
import json
import logging
import re
import udapi.block.read.conllu
from udapi.core.root import Root
from udapi.core.node import Node
class Conll(udapi.block.read.conllu.Conllu):
    """A reader of CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009) with configurable columns."""

    def __init__(self, separator='tab',
                 attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs):
        """Create the Conll reader object.

        This is a subclass of udapi.block.read.conllu.Conllu,
        which adds a support for arbitrary column names and thus supporting not only CoNLL-U,
        but also CoNLL-X, CoNLL-2009 and many other CoNLL-like formats.

        Args:
        separator: How are the columns separated?
            Default='tab' is the only possibility in valid CoNLL-U files.
            'space' means one or more whitespaces (this does not allow forms with space).
            'doublespace' means two or more spaces.
        attributes: comma-separated list of column names in the input files
            (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc')
            Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U).
            For ignoring a column, use "_" as its name.
            Column "ord" marks the column with 1-based word-order number/index (usually called ID).
            Column "head" marks the column with dependency parent index (word-order number).

            For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use
            `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_`
            but note that attributes upos, feats and deprel will contain language-specific values,
            not valid according to UD guidelines and a further conversion will be needed.
            You will lose the projective_HEAD and projective_DEPREL attributes.

            For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`.
            You will lose the predicted_* attributes and semantic/predicate annotation.

            TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]`
            for feats which do not use the name1=value1|name2=value2 format.
        kwargs: passed unchanged to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        self.separator = separator

    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.
    def parse_node_line(self, line, root, nodes, parents, mwts):
        """Parse one token line and register the result in the caller's accumulators.

        Args:
        line: one non-comment line of the input.
        root: the tree root under which the new node is created.
        nodes: list of nodes read so far; a regular node is appended to it.
        parents: list of head indices parallel to `nodes`; the integer value
            of the "head" column is appended to it.
        mwts: list collecting the split fields of multi-word-token lines
            (these are only stored here, not turned into tokens).

        Raises:
        ValueError: if `self.separator` is not one of the supported names,
            or if the "ord"/"head" column is not an integer (a "_" head is
            tolerated and replaced by 0 when `self.strict` is false).
        RuntimeError: if `self.strict` is true and the number of columns
            differs from the number of configured attributes.
        """
        if self.separator == 'tab':
            fields = line.split('\t')
        elif self.separator == 'space':
            fields = line.split()
        elif self.separator == 'doublespace':
            # Two or more spaces, so that a single space may occur inside a form.
            fields = re.split(r' {2,}', line)
        else:
            raise ValueError('separator=%s is not valid' % self.separator)

        n_expected = len(self.node_attributes)
        if len(fields) != n_expected:
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad missing columns with "_"; this is a no-op when there are too many columns.
            fields.extend(['_'] * (n_expected - len(fields)))

        # multi-word tokens will be processed later
        if '-' in fields[0]:
            mwts.append(fields)
            return

        # Empty nodes: the columns are read from the fixed CoNLL-U positions,
        # regardless of the configured `attributes`.
        if '.' in fields[0]:
            empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                            xpos=fields[4], feats=fields[5], misc=fields[9])
            empty.ord = fields[0]
            empty.raw_deps = fields[8]  # TODO
            return

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns
        node = root.create_child()
        for (n_attribute, attribute_name) in enumerate(self.node_attributes):
            value = fields[n_attribute]
            if attribute_name == 'head':
                try:
                    parents.append(int(value))
                except ValueError:
                    # Only a missing ("_") head is tolerated, and only in non-strict mode.
                    if self.strict or value != '_':
                        raise
                    if self.empty_parent == 'warn':
                        logging.warning("Empty parent/head index in '%s'", line)
                    parents.append(0)
            elif attribute_name == 'ord':
                node.ord = int(value)
            elif attribute_name == 'deps':
                node.raw_deps = value
            elif attribute_name != '_' and value != '_':
                setattr(node, attribute_name, value)

        nodes.append(node)

    # Acknowledged code duplication with read.Conllu
    def read_tree_from_lines(self, lines):
        """Build one dependency tree from the lines of a single sentence.

        Args:
        lines: non-empty lines of one sentence; lines starting with "#" are
            handed to `parse_comment_line`, all others to `parse_node_line`.

        Returns:
        The root of the new tree, or None if the lines contained no node.

        Raises:
        ValueError: if a HEAD index does not refer to an existing node or the root,
            or if a cycle is detected and `self.fix_cycles` is false.
        """
        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in lines:
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes, parents, mwts)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
            nodes.pop()
            root._children = []
            root._descendants = []

        # Set dependency parents (now, all nodes of the tree are created).
        n_nodes = len(nodes)
        for node_ord, node in enumerate(nodes[1:], 1):
            head = parents[node_ord]
            # An explicit range check is needed: a negative HEAD would otherwise
            # silently wrap around via Python's negative indexing.
            if not 0 <= head < n_nodes:
                raise ValueError("Node %s HEAD is out of range (%d)" % (node, head))
            parent = nodes[head]

            if node is parent:
                if not self.fix_cycles:
                    raise ValueError(f"Detected a cycle: {node} attached to itself")
                logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node)
                # Only rebind `parent`; the actual attachment happens once, below.
                parent = root
            elif node.children:
                # The node already has children, so attaching it below one of its
                # own descendants would close a cycle: climb from the parent upwards.
                climbing = parent._parent
                while climbing:
                    if climbing is node:
                        if not self.fix_cycles:
                            raise ValueError(f"Detected a cycle: {node}")
                        logging.warning("Ignoring a cycle (attaching to the root instead):\n%s",
                                        parent)
                        parent = root
                        break
                    climbing = climbing._parent

            node._parent = parent
            parent._children.append(node)

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root