Source code for udapi.block.ud.splittoken

"""
Block ud.SplitToken will split a given token into multiple tokens.
"""
from udapi.core.block import Block
import re
import logging


class SplitToken(Block):
    """
    Split a token into two or more. A MISC attribute is used to mark the tokens
    that should be split. (The attribute may have been set by an annotator or
    by a previous block that tests the specific conditions under which
    splitting is desired.) Multiword tokens are currently not supported: The
    node to be split cannot belong to a MWT. Note that the result will not be
    a MWT either (use the block ud.AddMwt if that is desired). There will be
    simply a new attribute SpaceAfter=No, accompanied by CorrectSpaceAfter=Yes
    (indicating that this was an error in the source text).
    """

    def __init__(self, misc_name='SplitToken', **kwargs):
        """
        Args:
            misc_name: name of the MISC attribute that can trigger the
                splitting (default: SplitToken). The value of the attribute
                should indicate where to split the token. It should be a
                string that is identical to node.form except that there is
                one or more spaces where the token should be split.
            **kwargs: passed unchanged to the Block constructor.
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name

    def process_node(self, node):
        """
        The SplitToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be split to multiple nodes and the
        attribute will be removed from MISC, or a warning will be issued that
        the splitting cannot be done and the attribute will stay in MISC
        (in that case a Bug=... attribute is added to MISC as well).
        Note that multiword token lines and empty nodes are not even scanned
        for the attribute, so if it is there, it will stay there but no
        warning will be printed.

        Optionally, SplitTokenMorpho in MISC can carry the morphological
        annotation of the new tokens (one space-separated block per new
        token, i.e. none for the first part, which stays in the original
        node). New tokens without a block get their own form as lemma and
        copy UPOS, XPOS and FEATS from the original node.

        Args:
            node: the node whose MISC is inspected and which may be split.

        Raises:
            ValueError: if a column inside a SplitTokenMorpho block does not
                contain '=' (the unpacking of the split result fails).
        """
        value = node.misc[self.misc_name]
        # An empty string is treated as "attribute not set".
        if value == '':
            return
        if node.multiword_token:
            logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        # NOTE(review): with the standard library logging module, fatal() only
        # logs at CRITICAL level and execution continues below -- confirm
        # whether the application is expected to abort here.
        if node.deps:
            logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')
        # Verify that the value of the MISC attribute can be used as specification
        # of the split.
        if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value):
            logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        if re.search(r'\s', node.form):
            logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        if re.sub(r' ', '', value) != node.form:
            logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        # Do the split.
        # Remember the original SpaceAfter; it will be moved to the last part.
        space_after = node.misc['SpaceAfter']
        forms = value.split(' ')
        new_forms = forms[1:]
        # Optionally, SplitTokenMorpho in MISC can have the morphological annotation
        # of the new tokens. For example:
        # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act
        # Columns are separated by the two characters backslash + 't' and
        # individual features by backslash + 'p' (see the parsing below).
        morpho_spec = node.misc['SplitTokenMorpho']
        if morpho_spec != '':
            morphoblocks = morpho_spec.split(' ')
            del node.misc['SplitTokenMorpho']
            if len(morphoblocks) != len(new_forms):
                logging.warning(f"MISC SplitTokenMorpho has {len(morphoblocks)} block(s) but {len(new_forms)} new token(s) will be created; missing blocks fall back to the default annotation and extra blocks are ignored.")
        else:
            morphoblocks = []
        # Pad with empty blocks so that zip() below cannot silently drop the
        # trailing forms when fewer morpho blocks than new tokens were given.
        morphoblocks.extend([''] * (len(new_forms) - len(morphoblocks)))
        node.form = forms[0]
        last_node = node
        for form, morpho in zip(new_forms, morphoblocks):
            # The parts were written together in the source text.
            last_node.misc['SpaceAfter'] = 'No'
            last_node.misc['CorrectSpaceAfter'] = 'Yes'
            # Default annotation of the new token.
            lemma = form
            upos = node.upos
            feats = str(node.feats)
            xpos = node.xpos
            if morpho != '':
                cols = morpho.split('\\t')
                for c in cols:
                    colname, colvalue = c.split('=', 1)
                    if colname == 'LEMMA':
                        lemma = colvalue
                    elif colname == 'UPOS':
                        upos = colvalue
                    elif colname == 'FEATS':
                        feats = re.sub(r'\\p', '|', colvalue)
                    elif colname == 'XPOS':
                        xpos = colvalue
                    else:
                        logging.fatal(f"c = {c}")
            # Every new token is attached to the original node as 'dep' and
            # placed right after the previously processed part.
            new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep')
            new_node.shift_after_node(last_node)
            last_node = new_node
        last_node.misc['SpaceAfter'] = space_after
        del node.misc[self.misc_name]