Source code for udapi.block.util.splitsentence

"""
Block util.SplitSentence will split a given sentence at a given token.
"""
import logging
from udapi.core.block import Block
from udapi.core.root import Root

class SplitSentence(Block):
    """
    If the sent_id of the current sentence matches the parameter, splits the
    sentence into two. The first token of the second sentence is also given as
    a parameter.

    Alternatively, a MISC attribute can be specified that triggers sentence
    splitting at the given token. With this approach, multiple sentence splits
    can be performed during one run.
    """

    def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs):
        """
        Args:
        sent_id: which sentence should be split (new ids will have A and B appended)
        word_id: which word should be the first word of the second sentence (tokens and words will be renumbered)
        misc_name: name of the MISC attribute that can trigger the split
            (cannot be combined with sent_id and word_id)
        misc_value: value of the MISC attribute to trigger the split;
            if not specified, then simple occurrence of the attribute with any value will cause the split
            MISC attributes that have triggered sentence split will be removed from their node.
        """
        super().__init__(**kwargs)
        # NOTE(review): invalid configurations are only reported through
        # logging.fatal here; whether that aborts the run depends on how the
        # application configures logging -- confirm.
        if misc_name:
            if sent_id or word_id:
                logging.fatal('Cannot combine misc_name with sent_id or word_id')
        else:
            if not sent_id:
                logging.fatal('Missing parameter sent_id')
            if not word_id:
                logging.fatal('Missing parameter word_id')
        # word_id is compared numerically with node.ord in process_document,
        # so a value that arrived as text (e.g. '5') must become a number,
        # otherwise the comparison raises TypeError.
        if isinstance(word_id, str) and word_id.isdigit():
            word_id = int(word_id)
        self.sent_id = sent_id
        self.word_id = word_id
        self.misc_name = misc_name
        self.misc_value = misc_value

    def process_document(self, document):
        """
        Split sentences of the document according to the block parameters.

        In the MISC-triggered mode every bundle is inspected and may be split
        at several tokens. In the sent_id/word_id mode only the first bundle
        whose bundle_id equals sent_id is split, and processing stops there.
        """
        # New bundles are inserted into document.bundles while we iterate over
        # it, so they are visited by later iterations. In the MISC mode they
        # are not split again because the triggering attribute is cleared on
        # every node that has caused a split.
        for bundle_no, bundle in enumerate(document.bundles):
            # In general, a bundle may contain multiple trees in different zones.
            # In UD data, we always expect just one zone (labeled '') per bundle.
            # This code could be extended to split all zones but we do not try to do it at present.
            # (The zones may be translations to other languages and it is not likely that we would
            # want to split each translation at the same position.)
            if len(bundle.trees) != 1:
                logging.fatal('Cannot process bundles that have less or more than 1 zone')
            if not bundle.has_tree(zone=''):
                logging.fatal('Cannot process bundles that do not have the zone with empty zone id')
            if self.misc_name:
                self._split_at_misc(document, bundle, bundle_no)
            elif bundle.bundle_id == self.sent_id:
                logging.info('Found!')
                self._split_at_word(document, bundle, bundle_no)
                # We have found our sentence. No need to process the rest of the document.
                break

    def _triggers_split(self, node):
        """Tell whether the MISC attribute of the node requests a split before it."""
        value = node.misc[self.misc_name]
        if self.misc_value is None:
            # No particular value required: any non-empty value triggers the split.
            return bool(value)
        return value == self.misc_value

    def _split_at_misc(self, document, bundle, bundle_no):
        """Split the bundle before every node carrying the triggering MISC attribute."""
        root = bundle.get_tree()
        # The first token can never start a *second* sentence, hence ord > 1.
        # The ord condition must hold regardless of whether misc_value is used.
        split_points = [n for n in root.descendants
                        if n.ord > 1 and self._triggers_split(n)]
        if not split_points:
            return
        # Create as many new bundles as there are split points.
        n_new = len(split_points)
        current_bid = bundle.bundle_id
        idletter = 'B'  # a letter will be added to bundle ids to distinguish them
        for i, split_point in enumerate(split_points):
            new_bundle = document.create_bundle()
            new_bundle.bundle_id = current_bid + idletter
            new_root = Root(zone='')
            new_bundle.add_tree(new_root)
            # Identify nodes to move to the new bundle. The ords are read
            # anew in every iteration (not cached before the loop) because
            # the previous iteration has removed nodes from the tree.
            first_node_id = split_point.ord
            if i < n_new - 1:
                next_first_node_id = split_points[i + 1].ord
                nodes_to_move = [n for n in root.descendants
                                 if first_node_id <= n.ord < next_first_node_id]
            else:
                nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id]
            new_root.steal_nodes(nodes_to_move)
            self.make_zeros_roots(new_root)
            new_root.text = new_root.compute_text()
            # The new bundle was created at the end of the document.
            # Move it to the position right after the current bundle
            # (and after the bundles created in previous iterations).
            document.bundles.pop()
            document.bundles.insert(bundle_no + i + 1, new_bundle)
            idletter = chr(ord(idletter) + 1)
            # Remove from the node the MISC attribute that triggered the sentence split.
            split_point.misc[self.misc_name] = ''
        # Update the id of the current bundle, fix its zero-dependents and recompute sentence text.
        bundle.bundle_id += 'A'
        self.make_zeros_roots(root)
        root.text = root.compute_text()
        # Update the bundle numbers of the new bundles and all bundles after them.
        # NOTE(review): numbering restarts at bundle_no + 1 for the bundle that
        # follows the current one; confirm this matches the numbering
        # convention used by the document for bundle.number.
        updated_no = bundle_no + 1
        for b in document.bundles[(bundle_no + 1):]:
            b.number = updated_no
            updated_no += 1

    def _split_at_word(self, document, bundle, bundle_no):
        """Split the bundle into two so that the second one starts at self.word_id."""
        root = bundle.get_tree()
        nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id]
        if len(nodes_to_move) == 0:
            logging.fatal('No nodes to move to the new sentence; word_id may be out of range')
        # Create a new bundle at the end of the current document.
        new_bundle = document.create_bundle()
        # Move the new bundle to the position right after the current bundle.
        new_bundle_no = bundle_no + 1
        document.bundles.pop()
        document.bundles.insert(new_bundle_no, new_bundle)
        # Renumber the new bundle and all bundles after it.
        # NOTE(review): confirm that starting at bundle_no + 1 matches the
        # numbering convention used by the document for bundle.number.
        updated_no = new_bundle_no
        for b in document.bundles[new_bundle_no:]:
            b.number = updated_no
            updated_no += 1
        new_bundle.bundle_id = bundle.bundle_id + 'B'
        bundle.bundle_id += 'A'
        new_root = Root(zone='')
        new_bundle.add_tree(new_root)
        new_root.steal_nodes(nodes_to_move)
        # The steal_nodes() method does not make sure that all nodes newly attached
        # to the artificial root have the 'root' relation. Fix it.
        self.make_zeros_roots(root)
        self.make_zeros_roots(new_root)
        # Update the sentence text attributes of the new sentences.
        root.text = root.compute_text()
        new_root.text = new_root.compute_text()

    def make_zeros_roots(self, root):
        """
        The steal_nodes() method does not make sure that all nodes newly attached
        to the artificial root have the 'root' relation. Fix it.

        Every node whose parent is the artificial root gets deprel 'root';
        a warning is logged if there is more than one such node.
        """
        n_root = 0
        for n in root.descendants:
            if n.parent.is_root():
                n.deprel = 'root'
                n_root += 1
        if n_root > 1:
            logging.warning('More than one 0:root relation in newly segmented sentence %s.',
                            root.bundle.bundle_id)