# Source code for udapi.block.ud.splittoken

"""
Block ud.SplitToken will split a given token into multiple tokens.
"""
from udapi.core.block import Block
import re
import logging



# [docs]
class SplitToken(Block):
    """
    Split a token into two or more tokens.

    A MISC attribute is used to mark the tokens that should be split. (The
    attribute may have been set by an annotator or by a previous block that
    tests the specific conditions under which splitting is desired.)
    Multiword tokens are currently not supported: The node to be split cannot
    belong to a MWT. Note that the result will not be a MWT either (use the
    block ud.AddMwt if that is desired). Every node except the last one of
    the resulting sequence simply gets SpaceAfter=No, accompanied by
    CorrectSpaceAfter=Yes (indicating that the missing space was an error in
    the source text).
    """

    def __init__(self, misc_name='SplitToken', **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the splitting
            default: SplitToken
            The value of the attribute should indicate where to split the token.
            It should be a string that is identical to node.form except that
            there is one or more spaces where the token should be split.
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name

    @staticmethod
    def _apply_morpho(morpho, annotation):
        """
        Overwrite entries of `annotation` with the columns listed in `morpho`.

        Args:
        morpho: one block of the SplitTokenMorpho value, i.e. COLUMN=value
            items separated by the two-character sequence backslash + 't'.
            Recognized columns are LEMMA, UPOS, XPOS and FEATS; inside FEATS,
            the two-character sequence backslash + 'p' stands for '|'.
        annotation: dict with the keys 'lemma', 'upos', 'xpos' and 'feats';
            it is modified in place.
        """
        for column in morpho.split('\\t'):
            # partition() never raises: an item without '=' yields the whole
            # item as the column name and is reported as unknown below.
            colname, _, colvalue = column.partition('=')
            if colname == 'LEMMA':
                annotation['lemma'] = colvalue
            elif colname == 'UPOS':
                annotation['upos'] = colvalue
            elif colname == 'FEATS':
                annotation['feats'] = colvalue.replace('\\p', '|')
            elif colname == 'XPOS':
                annotation['xpos'] = colvalue
            else:
                logging.fatal(f"Unknown or malformed column in SplitTokenMorpho: '{column}'")

    def process_node(self, node):
        """
        The SplitToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be split to multiple nodes and the
        attribute will be removed from MISC, or a warning will be issued that
        the splitting cannot be done and the attribute will stay in MISC
        (together with a new MISC attribute Bug describing the problem).

        The first part of the form stays in the original node. Each further
        part becomes a new child of the original node with deprel 'dep',
        placed right after the previously processed part. By default the new
        nodes take the part itself as lemma and copy UPOS, XPOS and FEATS
        from the original node; an optional MISC attribute SplitTokenMorpho
        can override these values (see the comment in the code below).
        """
        spec = node.misc[self.misc_name]
        if spec == '':
            return
        if node.multiword_token:
            logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        # NOTE(review): with the standard library, logging.fatal only emits
        # a CRITICAL record; processing continues unless the surrounding
        # framework arranges otherwise -- confirm the intended behavior.
        if node.deps:
            logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')
        # Verify that the value of the MISC attribute can be used as specification
        # of the split.
        if re.match(r'^\s', spec) or re.search(r'\s$', spec) or re.search(r'\s\s', spec):
            logging.warning(f"MISC {self.misc_name} is '{spec}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        if re.search(r'\s', node.form):
            logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        if re.sub(r' ', '', spec) != node.form:
            logging.warning(f"MISC {self.misc_name} value '{spec}' does not match the word form '{node.form}'.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        # Do the split.
        space_after = node.misc['SpaceAfter']
        forms = spec.split(' ')
        new_forms = forms[1:]
        # Optionally, SplitTokenMorpho in MISC can have the morphological annotation
        # of the new tokens (space-separated, one block per new token). For example:
        # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act
        if node.misc['SplitTokenMorpho'] != '':
            morphoblocks = node.misc['SplitTokenMorpho'].split(' ')
            del node.misc['SplitTokenMorpho']
        else:
            morphoblocks = []
        # Pad with empty blocks so that every new form is paired with a block;
        # otherwise zip() below would silently drop the trailing forms when
        # fewer blocks than new tokens were supplied. Surplus blocks are ignored.
        morphoblocks += [''] * (len(new_forms) - len(morphoblocks))
        node.form = forms[0]
        last_node = node
        for form, morpho in zip(new_forms, morphoblocks):
            last_node.misc['SpaceAfter'] = 'No'
            last_node.misc['CorrectSpaceAfter'] = 'Yes'
            # Defaults: the form serves as lemma, the rest is copied from
            # the node that is being split.
            annotation = {
                'lemma': form,
                'upos': node.upos,
                'feats': str(node.feats),
                'xpos': node.xpos,
            }
            if morpho != '':
                self._apply_morpho(morpho, annotation)
            new_node = node.create_child(form=form, deprel='dep', **annotation)
            new_node.shift_after_node(last_node)
            last_node = new_node
        # The last part inherits whatever SpaceAfter the unsplit token had.
        last_node.misc['SpaceAfter'] = space_after
        del node.misc[self.misc_name]