Source code for udapi.block.read.ccv

"""Ccv class is a reader for Corpus of Czech Verse json files."""
from udapi.core.basereader import BaseReader
from udapi.core.root import Root
from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText
import json


[docs]
class Ccv(BaseReader):
    r"""A reader for Corpus of Czech Verse json files.

    See https://github.com/versotym/corpusCzechVerse
    Each verse (line) is stored as one tree (although it is quite often not a whole sentence).
    Start of each stanza is marked with `newpar`.
    Start of each poem is marked with `newdoc = [poem_id]`.

    Args:
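    tokenize: if true (the default), create a node for every word and
        punctuation mark of a verse; otherwise each tree carries only the
        verse text and its rhyme/metre/stress annotation.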
    """
    def __init__(self, tokenize=True, **kwargs):
        """Set up the reader.

        Args:
            tokenize: stored as ``self.tokenize``; when true, `read_trees`
                creates nodes for the words and punctuation of each verse.
            **kwargs: passed unchanged to the parent class constructor.
        """
        # Trees already parsed from the file but not yet returned by read_tree().
        self._cache = None
        self.tokenize = tokenize
        super().__init__(**kwargs)


[docs]
    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?.

        This implementation returns always False.
        """
        return False



[docs]
    def read_tree(self):
        """Return the next verse tree of the current file, or None when it is exhausted.

        The first call parses the whole file with `read_trees` and caches the
        result; later calls hand the cached trees out one by one in file order.
        After the last tree has been returned, the next call returns None
        exactly once and forgets the cache, so that the call after that parses
        whatever ``self.filehandle`` refers to at that moment.

        NOTE(review): this relies on the caller switching to the next file
        after receiving None -- confirm against BaseReader.

        Returns:
            The next tree, or None if `read_trees` produced no trees or all
            trees of the current file have already been returned.
        """
        if self._cache is None:
            trees = self.read_trees()
            if not trees:
                return None
            # Stored reversed so that the cheap `pop()` yields file order.
            self._cache = list(reversed(trees))
        if self._cache:
            return self._cache.pop()
        # An empty (not None) cache means the file was fully delivered.
        # Do not call read_trees() again here: the file handle is already
        # consumed and json.load() would fail on it.
        self._cache = None
        return None



[docs]
    def read_trees(self):
        """Parse the whole current file and return one tree per verse.

        The file is loaded as a single JSON document and iterated as a list of
        poems; each poem's ``body`` is a list of stanzas and each stanza a list
        of verse lines.  Every line becomes a `Root` whose ``text`` is the
        verse text and whose ``json`` holds the line's ``rhyme``, ``metre``
        and ``stress`` values.  The first tree of each stanza gets ``newpar``
        set.  The first tree of each poem gets ``newdoc`` set to the poem id
        and the poem's ``p_author``, ``b_author`` and ``biblio`` values stored
        in its ``json``.

        Stanzas without lines and poems without any verse are skipped, because
        there is no tree that could carry their marks.

        Returns:
            A list of trees in file order, or None if no file handle is open.

        Raises:
            json.JSONDecodeError: if the file content is not valid JSON.
            KeyError: if a poem, line or word lacks an expected key.
        """
        if self.filehandle is None:
            return None
        poems = json.load(self.filehandle)
        all_trees = []
        for poem in poems:
            poem_trees = []
            for stanza in poem["body"]:
                stanza_trees = []
                for line in stanza:
                    root = Root()
                    root.text = line["text"]
                    root.json["rhyme"] = line["rhyme"]
                    root.json["metre"] = line["metre"]
                    root.json["stress"] = line["stress"]
                    stanza_trees.append(root)
                    if not self.tokenize:
                        continue
                    # slots[0] is reserved for punctuation indexed with 0;
                    # slots[i] starts with the i-th word, and punctuation
                    # indexed with i is appended after it.
                    slots = [[]] + [[w] for w in line["words"]]
                    for index, puncts in line["punct"].items():
                        for punct in puncts:
                            slots[int(index)].append({"token": punct, "lemma": punct})
                    for slot in slots:
                        for token in slot:
                            node = root.create_child(form=token["token"], lemma=token["lemma"])
                            # Punctuation entries built above have no "morph" key.
                            if "morph" in token:
                                node.xpos = token["morph"]
                                node.misc["xsampa"] = token["xsampa"]
                                node.misc["phoebe"] = token["phoebe"]
                    # Called unbound with None as `self`; assumes process_tree
                    # needs no instance state -- TODO confirm.
                    SetSpaceAfterFromText.process_tree(None, root)
                # An empty stanza has no first tree to mark with `newpar`.
                if not stanza_trees:
                    continue
                stanza_trees[0].newpar = True
                poem_trees.extend(stanza_trees)
            # A poem without any verse has no tree to carry its metadata.
            if not poem_trees:
                continue
            first = poem_trees[0]
            first.newdoc = poem["poem_id"]
            first.json["p_author"] = poem["p_author"]
            first.json["b_author"] = poem["b_author"]
            first.json["biblio"] = poem["biblio"]
            all_trees.extend(poem_trees)
        return all_trees