# Source code for udapi.block.ud.goeswithfromtext

"""Block GoeswithFromText for splitting nodes and attaching via goeswith according to the text.

Usage:
udapy -s ud.GoeswithFromText < in.conllu > fixed.conllu

Author: Martin Popel
"""
import logging

from udapi.core.block import Block



[docs]
class GoeswithFromText(Block):
    """Block for splitting nodes and attaching via goeswith according to the sentence text.

    For example::

        # text = Never the less, I agree.
        1 Nevertheless nevertheless ADV   _ _ 4 advmod _ SpaceAfter=No
        2 ,            ,            PUNCT _ _ 4 punct  _ _
        3 I            I            PRON  _ _ 4 nsubj  _ _
        4 agree        agree        VERB  _ _ 0 root   _ SpaceAfter=No
        5 .            .            PUNCT _ _ 4 punct  _ _

    is changed to::

        # text = Never the less, I agree.
        1 Never  never ADV   _ _ 6 advmod   _ _
        2 the    the   ADV   _ _ 1 goeswith _ _
        3 less   less  ADV   _ _ 1 goeswith _ SpaceAfter=No
        4 ,      ,     PUNCT _ _ 6 punct    _ _
        5 I      I     PRON  _ _ 6 nsubj    _ _
        6 agree  agree VERB  _ _ 0 root     _ SpaceAfter=No
        7 .      .     PUNCT _ _ 6 punct    _ _

    If used with parameter `keep_lemma=1`, the result is::

        # text = Never the less, I agree.
        1 Never  nevertheless ADV   _ _ 6 advmod   _ _
        2 the    _            ADV   _ _ 1 goeswith _ _
        3 less   _            ADV   _ _ 1 goeswith _ SpaceAfter=No
        4 ,      ,            PUNCT _ _ 6 punct    _ _
        5 I      I            PRON  _ _ 6 nsubj    _ _
        6 agree  agree        VERB  _ _ 0 root     _ SpaceAfter=No
        7 .      .            PUNCT _ _ 6 punct    _ _
    """

    def __init__(self, keep_lemma=False, **kwargs):
        """Create the block.

        Args:
            keep_lemma: if true, a split node keeps its original lemma and the newly
                created goeswith nodes are created with `lemma=None`; otherwise the
                first part gets its lowercased form as lemma and each new node gets
                its own form as lemma.
            **kwargs: passed on unchanged to the parent class constructor.
        """
        super().__init__(**kwargs)
        self.keep_lemma = keep_lemma

    # pylint: disable=too-many-branches
    def process_tree(self, root):
        """Align the tokens of `root` with its stored text and split tokens where needed.

        The stored sentence text (`root.text`) is walked token by token. A token whose
        form is found verbatim at the current position only gets its `SpaceAfter` MISC
        attribute synchronized with the text. A token whose form matches only after
        ignoring spaces is split on those spaces: the original node keeps the first part
        and one new child node with `deprel='goeswith'` is created for each further part.

        Nothing is changed when the stored text already equals `root.compute_text()`.
        A warning is logged (and nothing is changed) when the two texts differ in more
        than plain spaces.

        Args:
            root: the tree root to process; it must provide `text`, `compute_text()`
                and `token_descendants`.

        Raises:
            AssertionError: if a token cannot be located in the stored text even though
                the space-less texts were equal.
        """
        text = root.text
        if text is None:
            # No stored sentence text to align with (assumes a missing text is None).
            return
        computed = root.compute_text()
        if text == computed:
            return

        nospace_text = text.replace(' ', '')
        if nospace_text != computed.replace(' ', ''):
            logging.warning('Mismatch of the stored and computed text cannot be solved with '
                            'ud.GoeswithFromText:\n<<%s>>\n<<%s>>', text, computed)
            return

        # Normalize the stored text (double space -> single space)
        text = ' '.join(text.split())

        for node in root.token_descendants:
            nospace_form = node.form.replace(' ', '')
            if text.startswith(node.form):
                # The form is present verbatim; only SpaceAfter needs to be synchronized.
                text = text[len(node.form):]
                nospace_text = nospace_text[len(nospace_form):]
                if not text or text[0].isspace():
                    del node.misc['SpaceAfter']
                    text = text.lstrip()
                else:
                    node.misc['SpaceAfter'] = 'No'
            elif nospace_text.startswith(nospace_form):
                # The form matches only when spaces are ignored, so the token is written
                # with extra spaces in the text and has to be split.
                nospace_text = nospace_text[len(nospace_form):]
                # Find the shortest prefix of the text which equals the form after
                # removing spaces. The explicit bound check (instead of `assert`) keeps
                # the loop finite under `python -O`, e.g. when the stored text contains
                # whitespace other than a plain space, which `split()` above normalizes
                # but `replace(' ', '')` keeps.
                len_raw_form = len(nospace_form)
                while text[:len_raw_form].replace(' ', '') != nospace_form:
                    len_raw_form += 1
                    if len_raw_form > len(text):
                        raise AssertionError(
                            'Cannot align form "%s" with the remaining text "%s"'
                            % (node.form, text))
                raw_form = text[:len_raw_form]
                text = text[len_raw_form:]
                tokens = raw_form.split(' ')
                node.form = tokens[0]
                if not self.keep_lemma:
                    node.lemma = tokens[0].lower()
                # Inside a split token, every part is followed by a space.
                del node.misc['SpaceAfter']
                last_node = node
                for token in tokens[1:]:
                    lemma = None if self.keep_lemma else token
                    child = node.create_child(form=token, lemma=lemma, upos=node.upos,
                                              xpos=node.xpos, deprel='goeswith')
                    # Keep the parts in their surface order right after each other.
                    child.shift_after_node(last_node)
                    last_node = child
                if not text or text[0].isspace():
                    text = text.lstrip()
                else:
                    last_node.misc['SpaceAfter'] = 'No'
            else:
                # We have checked the whole sentence already, so this should be
                # unreachable. Raise explicitly so the check survives `python -O`.
                raise AssertionError(
                    'Form "%s" not found at the start of the remaining text "%s"'
                    % (node.form, text))
        if text:
            logging.warning('Extra text "%s" in tree %s', text, root)