Source code for udapi.block.util.splitsentence

"""
Block util.SplitSentence will split a given sentence at a given token.
"""
import logging
from udapi.core.block import Block
from udapi.core.root import Root


[docs]
class SplitSentence(Block):
    """
    If the sent_id of the current sentence matches the parameter, splits the
    sentence into two. The first token of the second sentence is also given as
    a parameter.

    Alternatively, a MISC attribute can be specified that triggers sentence
    splitting at the given token. With this approach, multiple sentence splits
    can be performed during one run.
    """

    def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs):
        """
        Args:
        sent_id: which sentence should be split (new ids will have A and B appended)
        word_id: which word should be the first word of the second sentence (tokens and words will be renumbered)
        misc_name: name of the MISC attribute that can trigger the split (cannot be combined with sent_id and word_id)
        misc_value: value of the MISC attribute to trigger the split; if not specified, then simple occurrence of the attribute with any value will cause the split
            MISC attributes that have triggered sentence split will be removed from their node.
        """
        super().__init__(**kwargs)
        if misc_name:
            if sent_id or word_id:
                logging.fatal('Cannot combine misc_value with sent_id or word_id')
        else:
            if not sent_id:
                logging.fatal('Missing parameter sent_id')
            if not word_id:
                logging.fatal('Missing parameter word_id')
        self.sent_id = sent_id
        self.word_id = word_id
        self.misc_name = misc_name
        self.misc_value = misc_value


[docs]
    def process_document(self, document):
        for bundle_no, bundle in enumerate(document.bundles):
            # In general, a bundle may contain multiple trees in different zones.
            # In UD data, we always expect just one zone (labeled '') per bundle.
            # This code could be extended to split all zones but we do not try to do it at present.
            # (The zones may be translations to other languages and it is not likely that we would
            # want to split each translation at the same position.)
            if len(bundle.trees) != 1:
                logging.fatal('Cannot process bundles that have less or more than 1 zone')
            if not bundle.has_tree(zone=''):
                logging.fatal('Cannot process bundles that do not have the zone with empty zone id')
            if self.misc_name:
                root = bundle.get_tree()
                split_points = [n for n in root.descendants if n.ord > 1 and n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value]
                if split_points:
                    # Create as many new bundles as there are split points.
                    n_new = len(split_points)
                    current_bid = bundle.bundle_id
                    idletter = 'B' # a letter will be added to bundle ids to distinguish them
                    for i in range(n_new):
                        new_bundle = document.create_bundle()
                        new_bundle.bundle_id = current_bid + idletter
                        new_root = Root(zone='')
                        new_bundle.add_tree(new_root)
                        # Identify nodes to move to the new bundle.
                        first_node_id = split_points[i].ord
                        if i < n_new - 1:
                            next_first_node_id = split_points[i+1].ord
                            nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id and n.ord < next_first_node_id]
                        else:
                            nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id]
                        new_root.steal_nodes(nodes_to_move)
                        self.make_zeros_roots(new_root)
                        new_root.text = new_root.compute_text()
                        # The new bundle was created at the end of the document.
                        # Move it to the position right after the current bundle.
                        document.bundles.pop()
                        document.bundles.insert(bundle_no + i + 1, new_bundle)
                        idletter = chr(ord(idletter) + 1)
                        # Remove from the node the MISC attribute that triggered the sentence split.
                        split_points[i].misc[self.misc_name] = ''
                    # Update the id of the current bundle, fix its zero-dependents and recompute sentence text.
                    bundle.bundle_id += 'A'
                    self.make_zeros_roots(root)
                    root.text = root.compute_text()
                    # Update the bundle numbers of the new bundles and all bundles after them.
                    updated_no = bundle_no + 1
                    for b in document.bundles[(bundle_no+1):]:
                        b.number = updated_no
                        updated_no += 1
            elif bundle.bundle_id == self.sent_id:
                logging.info('Found!')
                root = bundle.get_tree()
                nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id]
                if len(nodes_to_move) == 0:
                    logging.fatal('No nodes to move to the new sentence; word_id may be out of range')
                # Create a new bundle at the end of the current document.
                new_bundle = document.create_bundle()
                # Move the new bundle to the position right after the current bundle.
                new_bundle_no = bundle_no + 1
                document.bundles.pop()
                document.bundles.insert(new_bundle_no, new_bundle)
                updated_no = new_bundle_no
                for b in document.bundles[new_bundle_no:]:
                    b.number = updated_no
                    updated_no += 1
                new_bundle.bundle_id = bundle.bundle_id + 'B'
                bundle.bundle_id += 'A'
                new_root = Root(zone='')
                new_bundle.add_tree(new_root)
                new_root.steal_nodes(nodes_to_move)
                # The steal_nodes() method does not make sure that all nodes newly attached
                # to the artificial root have the 'root' relation. Fix it.
                self.make_zeros_roots(root)
                self.make_zeros_roots(new_root)
                # Update the sentence text attributes of the new sentences.
                root.text = root.compute_text()
                new_root.text = new_root.compute_text()
                # We have found our sentence. No need to process the rest of the document.
                break



[docs]
    def make_zeros_roots(self, root):
        """
        The steal_nodes() method does not make sure that all nodes newly attached
        to the artificial root have the 'root' relation. Fix it.
        """
        n_root = 0
        for n in root.descendants:
            if n.parent.is_root():
                n.deprel = 'root'
                n_root += 1
        if n_root > 1:
            logging.warning('More than one 0:root relation in newly segmented sentence %s.' % root.bundle.bundle_id)