Source code for udapi.block.util.split

"""util.Split is a special block for splitting documents."""
import math
from udapi.core.basereader import BaseReader

# pylint: disable=abstract-method
# read_tree() does not need to be installed here


[docs] class Split(BaseReader): """Split Udapi document (with sentence-aligned trees in bundles) into several parts.""" def __init__(self, parts=None, bundles_per_doc=None, **kwargs): """Args: parts: into how many parts should the document be split bundles_per_doc: number of bundles per the newly created part """ super().__init__(**kwargs) if parts is None and bundles_per_doc is None: raise ValueError('parts or bundles_per_doc must be specified') if parts is not None and bundles_per_doc is not None: raise ValueError('Cannot specify both parts and bundles_per_doc') self.parts = parts self.bundles_per_doc = bundles_per_doc self.buffer = None
[docs] @staticmethod def is_multizone_reader(): return False
[docs] def process_document(self, document): if not self.buffer: self.buffer = document.bundles document.bundles = [] if self.bundles_per_doc is None: self.bundles_per_doc = math.ceil(len(self.buffer) / self.parts) self.buffer.extend(document.bundles) document.bundles = self.buffer[:self.bundles_per_doc] self.buffer = self.buffer[self.bundles_per_doc:] self.finished = not self.buffer