Source code for udapi.block.read.vislcg

"""Vislcg is a reader block for the VISL-cg format."""
from udapi.core.basereader import BaseReader
from udapi.core.root import Root


class Vislcg(BaseReader):
    """A reader of the VISL-cg format, suitable for VISL Constraint Grammar Parser."""

    # TODO check validity and raise helpful exceptions if not valid
    # pylint: disable=too-many-branches
    def read_tree(self):
        """Read one sentence from ``self.filehandle`` and return its tree.

        Lines are consumed until the first empty line or the end of input:

        * a line starting with ``#`` is appended (without the ``#``) to
          ``root.comment``;
        * a line starting with whitespace is an analysis line and is turned
          into a node by :meth:`_node`;
        * any other line starts a new token; its form is the line with the
          first two and the last two characters removed.

        Returns:
            The newly built ``Root``, or ``None`` when there is no
            filehandle or when no node was read.

        Raises:
            ValueError: if the HEAD of a node is negative or larger than
                the number of nodes read for this sentence.
        """
        if self.filehandle is None:
            return None

        root = Root()
        # parents[i] is the HEAD ord of the i-th created node;
        # the leading 0 is a placeholder so the list can be indexed by ord.
        parents = [0]
        # Nodes created from the analysis lines of the token being read.
        words = []
        form = None
        for line in self.filehandle:
            line = line.rstrip()
            if line == '':
                break
            if line[0] == '#':
                root.comment += line[1:] + "\n"
                continue

            if line[0].isspace():
                node, parent_ord = self._node(line.lstrip(), root)
                words.append(node)
                parents.append(parent_ord)
                continue

            # A new token line: first finish the previous token (if any).
            if words:
                words[0].form = form
                if len(words) > 1:
                    # Several analysis lines under one token line
                    # are grouped into a multiword token.
                    split_forms = form.split()
                    if len(words) == len(split_forms):
                        for word, split_form in zip(words, split_forms):
                            word.form = split_form
                    else:
                        for word in words[1:]:
                            word.form = '_'
                    root.create_multiword_token(words, form=form)
                words = []
            form = line[2:-2]

        # Finish the last token of the sentence.
        # NOTE(review): unlike in the loop above, no multiword token is created
        # here and the form is never split -- confirm whether this is intended.
        if words:
            words[0].form = form
            for word in words[1:]:
                word.form = '_'

        # The loop below assumes nodes[0] is the root and nodes[i] the node
        # with ord i, which is how `parents` was filled.
        nodes = root.descendants(add_self=True)
        if len(nodes) == 1:
            return None
        for node_ord, node in enumerate(nodes[1:], 1):
            parent_ord = parents[node_ord]
            # Check the range explicitly instead of relying on IndexError:
            # a negative HEAD would be silently accepted by Python's negative
            # indexing and attach the node to one counted from the end.
            if not 0 <= parent_ord < len(nodes):
                raise ValueError("Node %s HEAD is out of range (%d)" % (node, parent_ord))
            node.parent = nodes[parent_ord]

        return root

    @staticmethod
    def _node(line, root):
        """Create a child of `root` from one analysis line.

        Args:
            line: the analysis line with leading whitespace already removed.
            root: the tree root; the new node is created via
                ``root.create_child``.

        Returns:
            A tuple ``(node, parent_ord)`` where ``parent_ord`` is the integer
            following ``->`` in the last whitespace-separated field.
        """
        # line contains "lemma" xpos feat1 feat2 .. featN @deprel #ord->parent.ord
        # Lemma can contain spaces, but quotes within lemma are not escaped,
        # so we cannot use fields = shlex.split(line)
        # Let's hope that xpos, feats and deprel do not contain any quotes.
        end_quote_pos = line.rfind('"')
        lemma = line[1:end_quote_pos]

        fields = line[end_quote_pos + 1:].split()
        xpos = fields[0]
        # NOTE(review): fields[1] and fields[2] are skipped here, although the
        # format sketched above suggests feats start right after xpos -- confirm.
        feats_list = fields[3:-2]
        feats = '|'.join(feats_list) if feats_list else '_'
        # Drop the first character of the deprel field ("@" in the sketch above).
        deprel = fields[-2][1:]
        parent_ord = int(fields[-1].split('->')[1])
        node = root.create_child(lemma=lemma, xpos=xpos, feats=feats, deprel=deprel)
        return node, parent_ord