Source code for udapi.block.read.sentences

"""Sentences class is a reader for plain-text sentences."""
from udapi.core.basereader import BaseReader
from udapi.core.root import Root



[docs]
class Sentences(BaseReader):
    r"""A reader for plain-text sentences (one sentence per line) files.

    Args:
    ignore_empty_lines: if True, delete empty lines from the input.
        Default=False.
    newdoc_if_empty_line: if True, empty lines mark document boundaries,
        which are marked with `root.newdoc`. Default=False.
    rstrip: a set of characters to be stripped from the end of each line.
        Default='\r\n '. You can use rstrip='\n' if you want to preserve
        any space or '\r' (Carriage Return) at end of line,
        so that `udpipe.Base` keeps these characters in `SpacesAfter`.
        As most blocks do not expect whitespace other than a space to appear
        in the processed text, using this feature is at your own risk.
    """
    def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False,
                 rstrip='\r\n ', **kwargs):
        if ignore_empty_lines and newdoc_if_empty_line:
            raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line")
        self.ignore_empty_lines = ignore_empty_lines
        self.newdoc_if_empty_line = newdoc_if_empty_line
        self.rstrip = rstrip
        super().__init__(**kwargs)


[docs]
    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?.

        This implementation returns always False.
        """
        return False



[docs]
    def read_tree(self, document=None):
        if self.filehandle is None:
            return None
        line = self.filehandle.readline()
        # if readline() returns an empty string, the end of the file has been
        # reached, while a blank line is represented by '\n'
        # (or '\r\n' if reading a Windows file on Unix machine).
        if line == '':
            return None
        preceded_by_empty_line = False
        if self.ignore_empty_lines or self.newdoc_if_empty_line:
            while line in {'\n', '\r\n'}:
                preceded_by_empty_line = True
                line = self.filehandle.readline()
                if line == '':
                    return None
        root = Root()
        root.text = line.rstrip(self.rstrip)
        if self.newdoc_if_empty_line and preceded_by_empty_line:
            root.newdoc = True
        return root


    # The first line in a file also marks a start of new document

[docs]
    def after_process_document(self, document):
        if self.newdoc_if_empty_line:
            document.bundles[0].trees[0].newdoc = True