Source code for udapi.block.ud.joinasmwt

"""Block ud.JoinAsMwt for creating multi-word tokens

if multiple neighboring words are not separated by a space
and the boundaries between the word forms are alphabetical.
"""
from udapi.core.block import Block


class JoinAsMwt(Block):
    """Create MWTs if words are not separated by a space.

    Starting from each word that is not yet part of a multi-word token (MWT),
    the block collects the longest run of following words for which
    `should_join` approves every adjacent pair, and wraps the run into
    a new MWT via `create_mwt`.

    Subclasses may customize the behavior by overriding `should_join`
    (the joining criterion) and `postprocess_mwt` (a hook called on each
    newly created MWT).
    """

    def __init__(self, revert_orig_form=True, **kwargs):
        """Create the block.

        Args:
            revert_orig_form: if any node of the newly created MWT has
                `misc['OrigForm']`, it is used as the FORM (and deleted
                from MISC). Useful after `ud.ComplyWithText`. Default=True.
            **kwargs: forwarded unchanged to the `Block` constructor.
        """
        super().__init__(**kwargs)
        self.revert_orig_form = revert_orig_form

    def process_node(self, node):
        """Create a MWT starting at `node` if the following words should be joined to it.

        Nodes that already belong to a MWT are skipped, and the run of joined
        words never extends into an existing MWT.
        """
        if node.multiword_token:
            return
        mwt_nodes = [node]
        # Greedily extend the run while each adjacent pair qualifies.
        while (node.next_node and not node.next_node.multiword_token
               and self.should_join(node, node.next_node)):
            node = node.next_node
            mwt_nodes.append(node)
        if len(mwt_nodes) > 1:
            self.create_mwt(mwt_nodes)

    def should_join(self, node, next_node):
        """Tell whether `node` and `next_node` should end up in the same MWT.

        The default criterion: there is no space after `node` and the boundary
        is alphabetical on both sides (the last character of `node.form` and
        the first character of `next_node.form` are both letters).
        Empty forms never qualify (instead of raising IndexError).
        """
        if not node.no_space_after or not node.form or not next_node.form:
            return False
        return node.form[-1].isalpha() and next_node.form[0].isalpha()

    def create_mwt(self, mwt_nodes):
        """Wrap `mwt_nodes` (two or more adjacent words) into a new multi-word token.

        The MWT form is the concatenation of the current word forms.
        `SpaceAfter` is moved from the words to the MWT, and the original word
        forms are restored from `misc['OrigForm']` if `revert_orig_form` is set.
        Finally, `postprocess_mwt` is called with the new MWT.
        """
        # The surface form must be computed before the word forms are reverted below.
        mwt_form = ''.join(n.form for n in mwt_nodes)
        mwt = mwt_nodes[0].root.create_multiword_token(words=mwt_nodes, form=mwt_form)
        # Whether a space follows the MWT is determined by its last word;
        # the non-final words have no space after them by construction.
        # NOTE(review): relies on misc[...] yielding a non-'No' value for a missing key
        # (the OrigForm check below makes the same assumption) -- confirm.
        if mwt_nodes[-1].misc['SpaceAfter'] == 'No':
            mwt.misc['SpaceAfter'] = 'No'
        # SpaceAfter is now recorded on the MWT (if needed), so drop it from the words.
        for mwt_node in mwt_nodes:
            del mwt_node.misc['SpaceAfter']
        if self.revert_orig_form:
            for mwt_node in mwt_nodes:
                if mwt_node.misc['OrigForm']:
                    mwt_node.form = mwt_node.misc['OrigForm']
                    del mwt_node.misc['OrigForm']
        self.postprocess_mwt(mwt)

    # a helper method to be overridden
    def postprocess_mwt(self, mwt):
        """Hook called with each newly created MWT; does nothing by default."""
        pass