# Source code for udapi.block.ud.bg.removedotafterabbr

"""Block ud.bg.RemoveDotAfterAbbr deletes extra PUNCT nodes after abbreviations.

Usage:
udapy -s ud.bg.RemoveDotAfterAbbr < in.conllu > fixed.conllu

Author: Martin Popel
"""
from udapi.core.block import Block



# [docs] (documentation-link marker left over from the HTML source listing)
class RemoveDotAfterAbbr(Block):
    """Delete the redundant sentence-final PUNCT node that follows an abbreviation.

    When an abbreviation is followed by an end-of-sentence period, most languages
    write just one period. Some treebanks (e.g. UD_Bulgarian v1.4) nevertheless
    annotate two periods::

        # text = 1948 г.
        1  1948  1948  ADJ
        2  г.    г.    NOUN
        3  .     .     PUNCT

    As a result, the `text` comment does not match the word forms.
    In https://github.com/UniversalDependencies/docs/issues/410 it was decided that
    the least-wrong solution (and the most common one in other treebanks) is to
    delete the end-of-sentence punctuation::

        # text = 1948 г.
        1  1948  1948  ADJ
        2  г.    г.    NOUN

    The block is not specific to Bulgarian; UD_Bulgarian is just probably the only
    treebank where this transformation is needed.
    """

    def process_tree(self, root):
        """Remove the last node of the tree if it is a superfluous final period.

        The last node is removed only when all of the following hold:

        * the tree has at least two nodes,
        * the form of the last node is exactly ``'.'``,
        * the form of the node before it ends with ``'.'``,
        * ``root.text`` is non-empty and ends neither with ``'..'`` nor ``'. .'``
          (i.e. the sentence text itself does not contain two final periods).

        Args:
            root: root of the tree to process; its ``descendants`` and ``text``
                attributes are read.
        """
        descendants = root.descendants
        if len(descendants) < 2:
            return

        final_node = descendants[-1]
        if final_node.form != '.':
            return
        if not descendants[-2].form.endswith('.'):
            return

        sentence = root.text
        # Without a sentence text there is nothing to compare the tokens against.
        if not sentence:
            return
        # Two periods really present in the text: the extra PUNCT node is legitimate.
        if sentence.endswith(('..', '. .')):
            return

        # NOTE(review): 'rehang_warn' presumably re-attaches any children of the
        # removed node and emits a warning -- confirm against Node.remove().
        final_node.remove(children='rehang_warn')