Source code for udapi.block.util.splitsentence

"""
Block util.SplitSentence will split a given sentence at a given token.
"""
import logging
from udapi.core.block import Block
from udapi.core.root import Root

class SplitSentence(Block):
    """Split one sentence of a document into two at a given word.

    If the sent_id of the current sentence matches the parameter, splits the
    sentence into two. The first token of the second sentence is also given
    as a parameter.
    """

    def __init__(self, sent_id=None, word_id=None, **kwargs):
        """Remember which sentence to split and where.

        Args:
            sent_id: which sentence should be split (new ids will have A and B
                appended)
            word_id: which word should be the first word of the second
                sentence (tokens and words will be renumbered)
            **kwargs: forwarded unchanged to the parent block constructor.
        """
        super().__init__(**kwargs)
        # NOTE(review): in the standard library logging.fatal() only emits a
        # CRITICAL record and does not raise, so construction continues even
        # when a parameter is missing -- confirm whether the project expects
        # an abort here.
        if not sent_id:
            logging.fatal('Missing parameter sent_id')
        if not word_id:
            logging.fatal('Missing parameter word_id')
        self.sent_id = sent_id
        # NOTE(review): word_id is compared with node.ord using >= below, so
        # it is presumably expected to be numeric -- verify against callers
        # that pass parameters as strings.
        self.word_id = word_id

    def process_document(self, document):
        """Split the first bundle whose bundle_id equals ``self.sent_id``.

        Only the first matching bundle is handled; the remaining bundles of
        the document are not inspected. If no bundle matches, the document is
        left unchanged.

        Args:
            document: the document whose ``bundles`` list is searched and, on
                a match, extended by one new bundle.
        """
        for bundle_no, bundle in enumerate(document.bundles):
            if bundle.bundle_id != self.sent_id:
                continue
            logging.info('Found!')
            self._split_bundle(document, bundle_no, bundle)
            # We have found our sentence. No need to process the rest of the document.
            break

    def _split_bundle(self, document, bundle_no, bundle):
        """Move the words from ``self.word_id`` onwards into a new bundle.

        The new bundle is placed right after ``bundle`` in
        ``document.bundles``. The original bundle id gets the suffix 'A' and
        the new one gets the original id with the suffix 'B'.

        If a precondition fails, the problem is logged and the document is
        left untouched.

        Args:
            document: the document that owns ``bundle``.
            bundle_no: index of ``bundle`` in ``document.bundles``.
            bundle: the bundle to be split.
        """
        # In general, a bundle may contain multiple trees in different zones.
        # In UD data, we always expect just one zone (labeled '') per bundle.
        # This code could be extended to split all zones but we do not try to do it at present.
        # (The zones may be translations to other languages and it is not likely that we would
        # want to split each translation at the same position.)
        #
        # logging.fatal() does not raise in the standard library, so each
        # failed check returns explicitly instead of going on to modify the
        # document in a state the message has just declared unprocessable.
        if len(bundle.trees) != 1:
            logging.fatal('Cannot process bundles that have less or more than 1 zone')
            return
        if not bundle.has_tree(zone=''):
            logging.fatal('Cannot process bundles that do not have the zone with empty zone id')
            return
        root = bundle.get_tree()
        nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id]
        if not nodes_to_move:
            logging.fatal('No nodes to move to the new sentence; word_id may be out of range')
            return

        # Create a new bundle at the end of the current document.
        new_bundle = document.create_bundle()
        # Move the new bundle to the position right after the current bundle.
        new_bundle_no = bundle_no + 1
        document.bundles.pop()
        document.bundles.insert(new_bundle_no, new_bundle)
        # Renumber the new bundle and everything after it.
        # NOTE(review): numbering starts at the list index of the new bundle
        # (kept exactly as before) -- confirm whether Bundle.number is meant
        # to be 0-based or 1-based.
        shifted_bundles = document.bundles[new_bundle_no:]
        for number, shifted in enumerate(shifted_bundles, start=new_bundle_no):
            shifted.number = number

        new_bundle.bundle_id = bundle.bundle_id + 'B'
        bundle.bundle_id += 'A'
        new_root = Root(zone='')
        new_bundle.add_tree(new_root)
        new_root.steal_nodes(nodes_to_move)

        # The steal_nodes() method does not make sure that all nodes newly attached
        # to the artificial root have the 'root' relation. Fix it.
        self._relabel_top_nodes(
            root,
            'More than one 0:root relation in the first part of the sentence.')
        self._relabel_top_nodes(
            new_root,
            'More than one 0:root relation in the second part of the sentence.')

        # Update the sentence text attributes of the new sentences.
        root.text = root.compute_text()
        new_root.text = new_root.compute_text()

    @staticmethod
    def _relabel_top_nodes(tree_root, multiple_roots_warning):
        """Set deprel 'root' on every node attached directly to the tree root.

        Args:
            tree_root: root of the tree whose descendants are checked.
            multiple_roots_warning: message logged as a warning when more than
                one node hangs directly on the root.
        """
        n_root = 0
        for node in tree_root.descendants:
            if node.parent.is_root():
                node.deprel = 'root'
                n_root += 1
        if n_root > 1:
            logging.warning(multiple_roots_warning)