Source code for udapi.block.tokenize.onwhitespace

"""Block tokenize.OnWhitespace"""
import re
from udapi.core.block import Block


class OnWhitespace(Block):
    r"""Base tokenizer, splits on whitespaces, fills SpaceAfter=No.

    Use the parameter `keep_spaces=True` to preserve all whitespaces in the
    sentence in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore`
    features in the MISC field. It is backward compatible with the CoNLL-U v2
    `SpaceAfter=No` feature. That is, no following whitespace is marked by
    `SpaceAfter=No`, and a single following space results in no
    whitespace-related markup.

    If the text is loaded using `read.Sentences` and all whitespaces need to be
    preserved (in order to be able to reconstruct the original document),
    the `read.Sentences` block must be called with `rstrip=''`, `rstrip='\n'`
    or `rstrip='\r\n'` to prevent stripping the trailing whitespace, e.g.::

      $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace keep_spaces=1 write.Conllu
      # sent_id = 1
      # text = Hello 	 world
      1	Hello	_	_	_	_	0	_	_	SpacesAfter=\s\t\s
      2	world	_	_	_	_	0	_	_	_

    Note that the attribute `SpaceAfter=No` is missing for the token `world`,
    since it is followed by a single space.

    Parameters
    ----------
    keep_spaces : bool
        preserve whitespaces by filling the MISC attributes `SpacesAfter`
        and `SpacesBefore` (by default False)
    """

    escape_whitespace_table = str.maketrans(
        {' ': r'\s', '\t': r'\t', '\r': r'\r', '\n': r'\n'})

    def __init__(self, keep_spaces=False, **kwargs):
        super().__init__(**kwargs)
        self.keep_spaces = keep_spaces
    @staticmethod
    def tokenize_sentence(string):
        """A method to be overridden in subclasses."""
        return string.split()
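    # A hypothetical subclass (an illustrative sketch, not part of udapi) could
    # split punctuation into separate tokens; process_tree() below then derives
    # SpaceAfter=No for words glued to the following punctuation:
    #
    #     class OnWhitespaceAndPunct(OnWhitespace):
    #         @staticmethod
    #         def tokenize_sentence(string):
    #             return re.findall(r'\w+|[^\w\s]', string)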
    def process_tree(self, root):
        if root.children:
            raise ValueError('Tree %s is already tokenized.' % root)
        # sentence = ' '.join(root.text.split())
        sentence = root.text
        tokens = self.tokenize_sentence(sentence)

        # Check if there are any spaces before the first token.
        spaces_before = ""
        m = re.match(r'\s+', sentence)
        if m:
            spaces_before = m.group(0)
            sentence = sentence[len(spaces_before):]

        for i, token in enumerate(tokens, 1):
            spaces_after = ""
            # The token (returned from tokenization) does not match the start
            # of the sentence, e.g. '. . . word' tokenized as '... word'.
            if not sentence.startswith(token):
                # A possible recovery (sketched in the Perl original, but never
                # ported): delete the unmatched prefix non-greedily up to the
                # next expected token and set SpaceAfter=No unless the prefix
                # ends with whitespace. For now, fail instead.
                raise ValueError('tokenization does not match: "%s" vs "%s"'
                                 % (token, sentence))

            # Delete the token from the beginning of the sentence.
            sentence = sentence[len(token):]

            # Set SpaceAfter and SpacesAfter properly.
            m = re.match(r'\s+', sentence)
            if m is not None:
                spaces_after = m.group(0)
                sentence = sentence[len(spaces_after):]

            # Normalize whitespace.
            if not self.keep_spaces:
                spaces_before = ""
                # spaces_after == "" means SpaceAfter=No; it is never set for
                # the last token, i.e. when len(sentence) == 0.
                spaces_after = "" if not spaces_after and sentence else " "

            # Create a new node.
            node = root.create_child(form=token)
            node.ord = i
            if i == 1 and spaces_before:
                node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table)
            if not spaces_after:
                node.misc["SpaceAfter"] = 'No'
            elif spaces_after != " ":
                node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table)
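A minimal usage sketch (not part of the module above; it assumes the standard
udapi API: `Document.create_bundle`, `Bundle.create_tree` and the dict-like
`node.misc`), showing how the tokenizer fills the whitespace-related MISC
attributes::

    from udapi.core.document import Document
    from udapi.block.tokenize.onwhitespace import OnWhitespace

    doc = Document()
    root = doc.create_bundle().create_tree()
    root.text = 'Hello \t world '   # '\t' is a real tab; note the extra spaces

    OnWhitespace(keep_spaces=True).process_tree(root)
    for node in root.descendants:
        print(node.ord, node.form, node.misc)

Here `Hello` ends up with `SpacesAfter=\s\t\s` in MISC, while `world` gets no
whitespace-related markup, since it is followed by a single space.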