Source code for udapi.block.ud.bg.removedotafterabbr

"""Block ud.bg.RemoveDotAfterAbbr deletes extra PUNCT nodes after abbreviations.

Usage:
udapy -s ud.bg.RemoveDotAfterAbbr < in.conllu > fixed.conllu

Author: Martin Popel
"""
from udapi.core.block import Block


class RemoveDotAfterAbbr(Block):
    """Drop a redundant sentence-final period node that follows an abbreviation.

    When an abbreviation ending in a period closes a sentence, most languages
    write only a single period. Some treebanks (e.g. UD_Bulgarian v1.4)
    nevertheless annotate two of them::

      # text = 1948 г.
      1 1948 1948 ADJ
      2 г. г. NOUN
      3 . . PUNCT

    As a result the `text` comment disagrees with the word forms.
    In https://github.com/UniversalDependencies/docs/issues/410 it was decided
    that the least-wrong fix (and the one most common in other treebanks) is
    to delete the sentence-final punctuation node::

      # text = 1948 г.
      1 1948 1948 ADJ
      2 г. г. NOUN

    Nothing in this block is Bulgarian-specific; UD_Bulgarian is merely
    (probably) the only treebank that needs this transformation.
    """

    def process_tree(self, root):
        """Remove the last node of the tree if it is a spurious extra period.

        The last node is removed only when all of the following hold:

        * the tree has at least two nodes,
        * the last node's form is exactly ``.``,
        * the form of the node before it ends with ``.``,
        * ``root.text`` is non-empty and does not itself end with two periods
          (either ``..`` or ``. .``), i.e. the sentence text really contains
          only one final period.

        Args:
            root: root of the tree to fix; its ``descendants`` and ``text``
                attributes are read.
        """
        tokens = root.descendants
        if len(tokens) < 2:
            return

        final = tokens[-1]
        before_final = tokens[-2]
        if final.form != '.':
            return
        if not before_final.form.endswith('.'):
            return

        # Without the sentence text there is nothing to compare the forms to.
        sentence = root.text
        if not sentence:
            return

        # Two periods in the text itself mean the annotation is consistent.
        if sentence.endswith(('..', '. .')):
            return

        # NOTE(review): 'rehang_warn' presumably re-attaches any children of
        # the removed node and emits a warning -- confirm against Node.remove.
        final.remove(children='rehang_warn')