Source code for udapi.block.tokenize.simple

"""Block tokenize.Simple"""
import re

from udapi.block.tokenize.onwhitespace import OnWhitespace


class Simple(OnWhitespace):
    """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No."""
    # NOTE(review): nothing in this class touches SpaceAfter; presumably that
    # part of the contract is implemented by the OnWhitespace base class, which
    # is expected to call tokenize_sentence() -- confirm against the base class.

    @staticmethod
    def tokenize_sentence(string):
        """Split ``string`` into word and punctuation tokens.

        Every maximal run of word characters (letters, digits and the
        underscore, as defined by the regex word class) becomes one token.
        Every other non-whitespace character becomes a token of its own.
        Whitespace never appears in the output.

        Args:
            string: the text to tokenize.

        Returns:
            A list of token strings in their original left-to-right order;
            an empty list if ``string`` has no non-whitespace characters.
        """
        # First alternative: a whole word. Second alternative: exactly one
        # character that is neither a word character nor whitespace, so
        # consecutive punctuation marks such as "!?" yield separate tokens.
        return re.findall(r'\w+|[^\w\s]', string)