Source code for udapi.block.ud.addpuncttype

"""
Some UD treebanks use features PunctType and PunctSide that classify
punctuation symbols. This block can be used to add such features to data where
they are missing – the classification is mostly deterministic. If the input
data already contains such features, their values will be overwritten.
"""
from udapi.core.block import Block

# TODO We need to know the language, there are many other quotation styles,
#      e.g. Finnish and Swedish uses the same symbol for opening and closing: ”X”.
#      Danish uses uses the French quotes, but switched: »X«.

PUNCT_TYPES = {
    '(': 'Brck',
    ')': 'Brck',
    '[': 'Brck',
    ']': 'Brck',
    '{': 'Brck',
    '}': 'Brck',
    '.': 'Peri',
    '...': 'Elip',
    '…': 'Elip',
    ',': 'Comm',
    ';': 'Semi',
    ':': 'Colo',
    '!': 'Excl',
    '¡': 'Excl', # Spanish initial exclamation mark
    '?': 'Qest',
    '¿': 'Qest', # Spanish initial question mark
    '/': 'Colo', # it is used this way in AnCora
    '-': 'Dash',
    '–': 'Dash',
    '—': 'Dash',
    '"': 'Quot',
    "'": 'Quot',
    '`': 'Quot',
    '“': 'Quot', # opening English, closing Czech
    '”': 'Quot', # closing English
    '„': 'Quot', # opening Czech
    '‘': 'Quot', # opening English, closing Czech
    '’': 'Quot', # closing English
    '‚': 'Quot', # opening Czech
    '«': 'Quot', # opening French, closing Danish
    '»': 'Quot', # closing French, opening Danish
    '‹': 'Quot',
    '›': 'Quot',
    '《': 'Quot', # Korean, Chinese
    '》': 'Quot',
    '「': 'Quot', # Chinese, Japanese
    '」': 'Quot',
    '『': 'Quot',
    '』': 'Quot'
}

PUNCT_SIDES = {
    '(': 'Ini',
    ')': 'Fin',
    '[': 'Ini',
    ']': 'Fin',
    '{': 'Ini',
    '}': 'Fin',
    '¡': 'Ini', # Spanish initial exclamation mark
    '!': 'Fin', # but outside Spanish people may expect empty value
    '¿': 'Ini', # Spanish initial question mark
    '?': 'Fin',
    '《': 'Ini', # Korean, Chinese
    '》': 'Fin',
    '「': 'Ini', # Chinese, Japanese
    '」': 'Fin',
    '『': 'Ini',
    '』': 'Fin'
}


[docs] class AddPunctType(Block): """Add features PunctType and PunctSide where applicable."""
[docs] def process_node(self, node): # The two features apply only to PUNCT. If they already occur elsewhere, erase them. if node.upos != 'PUNCT': node.feats['PunctType'] = '' node.feats['PunctSide'] = '' else: if node.form in PUNCT_TYPES: node.feats['PunctType'] = PUNCT_TYPES[node.form] else: node.feats['PunctType'] = '' if node.form in PUNCT_SIDES: node.feats['PunctSide'] = PUNCT_SIDES[node.form] else: node.feats['PunctSide'] = ''