# Source code for udapi.block.ud.ro.setspaceafter

"""Block ud.ro.SetSpaceAfter for heuristic setting of SpaceAfter=No in Romanian.

Usage::

  udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu

Author: Martin Popel
"""
import re

import udapi.block.ud.setspaceafter


class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter):
    """Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian.

    Romanian uses many contractions, e.g.

    ======= ======= ========= ==========
    raw     meaning tokenized lemmatized
    ======= ======= ========= ==========
    n-ar    nu ar   n- ar     nu avea
    să-i    să îi   să -i     să el
    într-o  în o    într- o   întru un
    nu-i    nu îi   nu -i     nu el
    nu-i    nu e    nu -i     nu fi
    ======= ======= ========= ==========

    Detokenization is quite simple: no space after word-final hyphen
    and before word-initial hyphen.
    There are just two exceptions, I have found:

    * "-" the hyphen itself (most probably it means a dash separating phrases/clauses)
    * negative numbers, e.g. "-3,1"
    """

    # Matches a form that starts with a hyphen and contains, somewhere after it,
    # at least one character that is not a digit, comma or period.
    # So "-i" and "-urilor" match, while "-" and "-12,3" do not.
    _HYPHEN_INITIAL_RE = re.compile(r'-.*[^0-9,.]')

    def process_tree(self, root):
        """Apply the Romanian hyphen heuristics, then the parent block's processing.

        Every node except the last one is inspected together with its
        right-hand neighbour, and ``self.mark_no_space(node)`` is called when

        * the neighbour's form is a hyphen-initial contraction
          (e.g. -i, -și, -l, -urilor, but not a negative number like -12,3), or
        * the node's own form ends with a hyphen and is not the bare "-"
          (e.g. s-, într-, și-, printr-).

        Args:
            root: root of the tree; its ``descendants`` are the inspected nodes.
        """
        nodes = root.descendants
        # Pair each node with its successor; the last node has no successor
        # and is therefore never marked by these heuristics.
        for node, next_node in zip(nodes, nodes[1:]):

            # Hyphen-initial contraction follows:
            # store SpaceAfter=No to the previous node (i.e. this one).
            if self._HYPHEN_INITIAL_RE.match(next_node.form):
                self.mark_no_space(node)

            # Hyphen-final contraction. `endswith` (unlike indexing with [-1])
            # is safe even if the form happens to be an empty string.
            form = node.form
            if form != '-' and form.endswith('-'):
                self.mark_no_space(node)

        # NOTE(review): presumably the parent block applies its own
        # language-independent heuristics here -- confirm in the base class.
        super().process_tree(root)