Source code for udapi.block.read.ccv

"""Ccv class is a reader for Corpus of Czech Verse json files."""
from udapi.core.basereader import BaseReader
from udapi.core.root import Root
from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText
import json

class Ccv(BaseReader):
    r"""A reader for Corpus of Czech Verse json files.

    See https://github.com/versotym/corpusCzechVerse

    Each verse (line) is stored as one tree
    (although it is quite often not a whole sentence).
    Start of each stanza is marked with `newpar`.
    Start of each poem is marked with `newdoc = [poem_id]`.

    Args:
        tokenize: create nodes
    """

    def __init__(self, tokenize=True, **kwargs):
        self.tokenize = tokenize
        # Trees already loaded by read_trees() but not yet handed out by
        # read_tree(); kept in reverse order so that pop() yields file order.
        self._cache = None
        # The filehandle whose whole content has already been loaded by
        # read_tree(), so that it is never parsed a second time.
        self._cached_filehandle = None
        super().__init__(**kwargs)

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?

        This implementation returns always False.
        """
        return False

    def read_tree(self):
        """Return the next tree (its root), or None if there are no more trees.

        The json file cannot be parsed incrementally, so on the first call
        for a given filehandle all its trees are loaded via `read_trees()`
        and the rest is cached for subsequent calls.
        """
        if self._cache:
            return self._cache.pop()

        handle = self.filehandle
        if handle is not None and handle is self._cached_filehandle:
            # This filehandle has been fully consumed and all its trees have
            # been returned. Calling json.load() on it again would raise
            # JSONDecodeError (empty input), so signal the end instead.
            return None

        trees = self.read_trees()
        self._cached_filehandle = handle
        if not trees:
            return None
        self._cache = list(reversed(trees[1:]))
        return trees[0]

    def read_trees(self):
        """Load all poems from `self.filehandle` and return a list of roots.

        Returns None if there is no filehandle. One tree is created per verse.
        The first tree of each stanza gets `newpar = True`; the first tree
        of each poem gets `newdoc` set to the poem id and the poem-level
        metadata (`p_author`, `b_author`, `biblio`) stored in `root.json`.
        Stanzas and poems without any verse produce no trees.
        """
        if self.filehandle is None:
            return None
        poems = json.load(self.filehandle)

        all_trees = []
        for poem in poems:
            poem_trees = []
            for stanza in poem["body"]:
                stanza_trees = [self._read_line(line) for line in stanza]
                # Guard against an empty stanza (nothing to mark as newpar).
                if stanza_trees:
                    stanza_trees[0].newpar = True
                    poem_trees.extend(stanza_trees)

            # A poem without any verse has no tree to carry its metadata.
            if not poem_trees:
                continue

            first_root = poem_trees[0]
            first_root.newdoc = poem["poem_id"]
            first_root.json["p_author"] = poem["p_author"]
            first_root.json["b_author"] = poem["b_author"]
            first_root.json["biblio"] = poem["biblio"]
            all_trees.extend(poem_trees)
        return all_trees

    def _read_line(self, line):
        """Create and return the tree (root) for one verse given as a json dict."""
        root = Root()
        root.text = line["text"]
        root.json["rhyme"] = line["rhyme"]
        root.json["metre"] = line["metre"]
        root.json["stress"] = line["stress"]
        if self.tokenize:
            self._tokenize(root, line)
        return root

    @staticmethod
    def _tokenize(root, line):
        """Add one node per word and per punctuation token of `line` to `root`."""
        # slots[0] holds punctuation preceding the first word,
        # slots[i] holds the i-th word followed by the punctuation after it.
        slots = [[]] + [[word] for word in line["words"]]
        for index, puncts in line["punct"].items():
            for punct in puncts:
                slots[int(index)].append({"token": punct, "lemma": punct})

        for slot in slots:
            for token in slot:
                node = root.create_child(form=token["token"], lemma=token["lemma"])
                # Only real words carry morphology and phonetic transcriptions;
                # the punctuation entries created above have just token+lemma.
                if "morph" in token:
                    node.xpos = token["morph"]
                    node.misc["xsampa"] = token["xsampa"]
                    node.misc["phoebe"] = token["phoebe"]

        # The block method is called as a plain function (None instead of
        # a block instance); presumably it fills SpaceAfter based on root.text.
        SetSpaceAfterFromText.process_tree(None, root)