# Source code for udapi.block.ud.de.fixgsd

"""
Block to fix annotation of UD German-GSD.
"""
from udapi.core.block import Block
import logging
import re

class FixGSD(Block):
    """Block that normalizes ordinal numerals in UD German-GSD."""

    @staticmethod
    def _retag_as_ordinal(target, clear_feats):
        """Tag ``target`` as an ordinal adjective.

        Sets UPOS to ADJ, XPOS to ADJA and NumType to Ord. When
        ``clear_feats`` is true, all features are wiped before NumType is
        set; otherwise the existing features are kept. A ``nummod``
        relation is turned into ``amod``.
        """
        target.upos = 'ADJ'
        target.xpos = 'ADJA'
        if clear_feats:
            target.feats = '_'
        target.feats['NumType'] = 'Ord'
        if target.udeprel == 'nummod':
            target.deprel = 'amod'

    def process_node(self, node):
        """
        Normalizes tokenization, lemmatization and tagging of ordinal numerals
        that are expressed using digits followed by a period.
        https://github.com/UniversalDependencies/UD_German-GSD/issues/24

        Three independent steps are applied to ``node``:

        1. A period that is not the last node and directly follows a
           digits-only token (no space in between) is merged into that token.
        2. A token that already consists of digits plus a period gets the
           ordinal annotation.
        3. A token whose lemma is a spelled-out ordinal gets the ordinal
           annotation while keeping its other features.
        """
        # Step 1. A sentence-final period (no next node) is left alone even
        # though it might belong to an ordinal as well; number+period pairs
        # separated by a space are left alone too.
        glued_to_digits = (
            node.form == '.'
            and node.next_node
            and node.prev_node
            and re.match(r'^\d+$', node.prev_node.form)
            and node.prev_node.no_space_after
        )
        if glued_to_digits:
            digits, dot = node.prev_node, node
            # A period is not expected to have children; if it has any,
            # hand them over to the number before the period disappears.
            for child in dot.children:
                child.parent = digits
            # The merged token inherits the spacing that followed the period.
            if dot.no_space_after:
                digits.misc['SpaceAfter'] = 'No'
            else:
                digits.misc['SpaceAfter'] = ''
            digits.form += '.'
            digits.lemma = digits.form
            self._retag_as_ordinal(digits, clear_feats=True)
            dot.remove()

        # Step 2. Digits and period already form one token: verify the tags.
        if re.match(r'^\d+\.$', node.form):
            node.lemma = node.form
            self._retag_as_ordinal(node, clear_feats=True)

        # Step 3. Ordinals written out in words. Gender, Number and Case are
        # kept here, unlike for the digit forms. 'acht' is not in the list
        # because it cannot be reliably told apart from the cardinal numeral
        # and from the verb 'achten'.
        if not re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE):
            return
        # 'erst' used as an adverb is not an ordinal numeral.
        if node.lemma == 'erst' and node.upos == 'ADV':
            return
        # Strip a trailing inflectional '-er' from the lemma.
        node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma)
        self._retag_as_ordinal(node, clear_feats=False)