# Source code for udapi.block.ud.de.fixhdt

"""
Block to fix annotation of UD German-HDT.

It was created independently of ud.de.AddMwt but it aims to do essentially the
same thing. Future work: make the two blocks converge.

Currently known differences:
- This block covers a wider range of contractions.
- This block generates morphological features for the syntactic words.
- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT).
- This block overrides the default attachment when the original relation is root, conj, reparandum.
- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures.
"""
from udapi.core.block import Block
import logging
import re

class FixHDT(Block):
    """Decompose fused preposition+article tokens of UD German-HDT.

    A node tagged ADP with PronType=Art whose form is one of the known
    contractions is turned into a multiword token with two syntactic words:
    the preposition (the original node) and a definite article (a new node
    placed right after it).
    """

    # Every contraction handled by this block:
    # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums
    # unterm unters vom vorm vors z. zum zur
    _CONTRACTION = re.compile(
        r"^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums"
        r"|unter[ms]|vom|vor[ms]|z\.|zu[mr])$",
        re.IGNORECASE)

    # Relations for which the article stays attached below the preposition.
    _KEEP_ATTACHMENT = re.compile(r"^(root|conj|reparandum)$")

    # (pattern over the fused form, lemma of the preposition); first match wins.
    _PREPOSITIONS = (
        (re.compile(r"^(a\.|am|ans)$", re.IGNORECASE), 'an'),
        (re.compile(r"^aufs$", re.IGNORECASE), 'auf'),
        (re.compile(r"^beim$", re.IGNORECASE), 'bei'),
        (re.compile(r"^durchs$", re.IGNORECASE), 'durch'),
        (re.compile(r"^fürs$", re.IGNORECASE), 'für'),
        (re.compile(r"^hinter[ms]$", re.IGNORECASE), 'hinter'),
        (re.compile(r"^(im|ins)$", re.IGNORECASE), 'in'),
        (re.compile(r"^übers$", re.IGNORECASE), 'über'),
        (re.compile(r"^ums$", re.IGNORECASE), 'um'),
        (re.compile(r"^unter[ms]$", re.IGNORECASE), 'unter'),
        (re.compile(r"^vom$", re.IGNORECASE), 'von'),
        (re.compile(r"^vor[ms]$", re.IGNORECASE), 'vor'),
        (re.compile(r"^(z\.|zu[mr])$", re.IGNORECASE), 'zu'),
    )

    # (pattern over the end of the fused form, article form, Case of the
    # preposition, features of the article); first match wins.
    # These patterns are applied with search() because match() only checks
    # at the beginning of the string.
    _ARTICLES = (
        (re.compile(r"[m\.]$", re.IGNORECASE), 'dem', 'Dat',
         'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art'),
        (re.compile(r"s$", re.IGNORECASE), 'das', 'Acc',
         'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art'),
        (re.compile(r"r$", re.IGNORECASE), 'der', 'Dat',
         'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
    )

    def process_node(self, node):
        """Split `node` into preposition + article if it is a fused contraction.

        Nodes that are not ADP with PronType=Art, or whose form is not one of
        the known contractions, are left untouched. Returns None.
        """
        # PronType=Art with ADP is wrong. Fused prepositions and articles
        # should be decomposed in UD.
        if node.upos != 'ADP' or node.feats['PronType'] != 'Art':
            return
        if not self._CONTRACTION.match(node.form):
            return

        # We need two nodes instead of one. Create a node.
        article = node.create_child()
        article.shift_after_node(node)
        # The parent should not be the root but unfortunately it is not guaranteed.
        # NOTE(review): the nesting of the next three assignments was
        # reconstructed; confirm that the reattachment and the 'case' relabel
        # are conditional while the 'det' label is unconditional.
        if not self._KEEP_ATTACHMENT.match(node.udeprel):
            article.parent = node.parent
            node.deprel = 'case'
        article.deprel = 'det'
        mwt = node.root.create_multiword_token(form=node.form, words=[node, article], misc=node.misc)
        node.misc['SpaceAfter'] = ''

        # We want to respect the original letter case in the forms of the
        # syntactic words: 2 = all uppercase, 1 = first letter uppercase,
        # 0 = anything else.
        surface = mwt.form
        if surface.isupper():
            prep_case = 2
        elif surface[:1].isupper():
            prep_case = 1
        else:
            prep_case = 0
        # The article is uppercased only if the whole contraction is uppercase.
        art_case = 2 if prep_case == 2 else 0

        for pattern, preposition in self._PREPOSITIONS:
            if pattern.match(surface):
                node.form = mimic_case(prep_case, preposition)
                node.lemma = preposition
                break
        node.upos = 'ADP'
        node.xpos = 'APPR'
        # Drop the old features (including PronType=Art) before adding new ones.
        node.feats = '_'
        node.feats['AdpType'] = 'Prep'

        for pattern, article_form, prep_case_feat, article_feats in self._ARTICLES:
            if pattern.search(surface):
                article.form = mimic_case(art_case, article_form)
                article.feats = article_feats
                node.feats['Case'] = prep_case_feat
                article.lemma = 'der'
                break
        article.upos = 'DET'
        article.xpos = 'ART'
def mimic_case(up, x):
    """Return the string `x` recased according to the level `up`.

    up >= 2 gives all uppercase, up == 1 gives an uppercase first character
    followed by a lowercased remainder, and any other value gives all
    lowercase.
    """
    if up >= 2:
        return x.upper()
    if up == 1:
        head, tail = x[:1], x[1:]
        return head.upper() + tail.lower()
    return x.lower()