# Source code for udapi.block.ud.es.fixexclamation

"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora."""
from udapi.core.block import Block
import logging
import re


# (A Sphinx "[docs]" link marker stood here; it is not part of the module.)
class FixExclamation(Block):
    """Split exclamation marks off word tokens in UD Spanish-AnCora.

    A token such as '¡Hola!' is divided into the word itself and separate
    PUNCT nodes, which are created as children of the word node.
    """

    # Token-initial exclamation mark immediately followed by a word character.
    _LEADING_EXCL = re.compile(r'^[¡!]\w')
    # Token-final exclamation mark immediately preceded by a word character.
    _TRAILING_EXCL = re.compile(r'\w[¡!]$')

    def process_node(self, node):
        """
        In Spanish AnCora, there are things like '¡Hola!' as one token.
        The punctuation should be separated. One may question whether this
        should include names of companies (Yahoo!) or products (la revista
        Hello!) but it should, as company and product names often have
        multiple tokens (even multiple full words, not just punctuation)
        and these are also separated in UD.

        At most one exclamation mark is split off each end of the form, and
        only when it directly touches a word character. Both ends are checked
        independently, so '¡Hola!' yields two new punctuation nodes. Every
        node that was split gets ``Mark=PunctSep`` in MISC for manual review.
        Only the form of the word node is shortened; its lemma is left as is.

        Args:
            node: the node whose form is inspected and possibly split.
        """
        if self._LEADING_EXCL.search(node.form):
            # Separate the punctuation and attach it to the rest.
            punct = node.create_child()
            punct.shift_before_node(node)
            punct.form = node.form[:1]
            node.form = node.form[1:]
            self._annotate_exclamation(punct)
            # The mark was glued to the word, so no space follows it.
            punct.misc['SpaceAfter'] = 'No'
            # Mark the position for manual check.
            node.misc['Mark'] = 'PunctSep'
        # Checked on the (possibly already shortened) form, hence not `elif`.
        if self._TRAILING_EXCL.search(node.form):
            # Separate the punctuation and attach it to the rest.
            punct = node.create_child()
            punct.shift_after_node(node)
            punct.form = node.form[-1:]
            node.form = node.form[:-1]
            self._annotate_exclamation(punct)
            # The new last token takes over the word's spacing information,
            # while the word is now directly followed by the punctuation.
            # NOTE(review): relies on node.misc yielding a usable value when
            # 'SpaceAfter' is absent -- confirm against the udapi MISC API.
            punct.misc['SpaceAfter'] = node.misc['SpaceAfter']
            node.misc['SpaceAfter'] = 'No'
            # Mark the position for manual check.
            node.misc['Mark'] = 'PunctSep'

    @staticmethod
    def _annotate_exclamation(punct):
        """Fill in the morphology and relation of a split-off exclamation mark.

        The lemma is copied from ``punct.form``, which must already be set.
        The opening/closing distinction depends only on the character itself
        ('¡' versus anything else), not on which end of the token it came from.
        """
        is_opening = punct.form == '¡'
        punct.lemma = punct.form
        punct.upos = 'PUNCT'
        # Presumably the AnCora tags for opening/closing exclamation marks.
        punct.xpos = 'faa' if is_opening else 'fat'
        punct.feats['PunctType'] = 'Excl'
        punct.feats['PunctSide'] = 'Ini' if is_opening else 'Fin'
        punct.deprel = 'punct'