Source code for udapi.block.ud.id.fixgsd

"""Block to fix annotation of UD Indonesian-GSD."""
from udapi.core.block import Block
import logging
import re

[docs] class FixGSD(Block):
[docs] def fix_upos_based_on_morphind(self, node): """ Example from data: ("kesamaan"), the correct UPOS is NOUN, as suggested by MorphInd. Based on my observation so far, if there is a different UPOS between the original GSD and MorphInd, it's better to trust MorphInd I found so many incorrect UPOS in GSD, especially when NOUNs become VERBs and VERBs become NOUNs. I suggest adding Voice=Pass when the script decides ke-xxx-an as VERB. """ if node.upos == 'VERB' and node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE): node.upos = 'NOUN' if node.udeprel == 'acl': node.deprel = 'nmod' elif node.udeprel == 'advcl': node.deprel = 'obl'
[docs] def fix_semua(self, node): """ Indonesian "semua" means "everything, all". Originally it was DET, PRON, or ADV. Ika: I usually only labeled "semua" as DET only if it's followed by a NOUN/PROPN. If it's followed by DET (including '-nya' as DET) or it's not followed by any NOUN/DET, I labeled them as PRON. """ if node.form.lower() == 'semua': if re.match(r'^(NOUN|PROPN)$', node.parent.upos) and node.parent.ord > node.ord: node.upos = 'DET' if node.udeprel == 'nmod' or node.udeprel == 'advmod': node.deprel = 'det' else: node.upos = 'PRON' if node.udeprel == 'det' or node.udeprel == 'advmod': node.deprel = 'nmod' node.feats['PronType'] = 'Tot'
[docs] def fix_ordinal_numerals(self, node): """ Ordinal numerals should be ADJ NumType=Ord in UD. They have many different UPOS tags in Indonesian GSD. This method harmonizes them. pertama = first kedua = second ketiga = third keempat = fourth kelima = fifth keenam = sixth ketujuh = seventh kedelapan = eighth kesembilan = ninth ke-48 = 48th However! The ke- forms (i.e., not 'pertama') can also function as total versions of cardinal numbers ('both', 'all three' etc.). If the numeral precedes the noun, it is a total cardinal; if it follows the noun, it is an ordinal. An exception is when the modified noun is 'kali' = 'time'. Then the numeral is ordinal regardless where it occurs, and together with 'kali' it functions as an adverbial ordinal ('for the second time'). """ # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE): node.upos = 'ADJ' node.feats['NumType'] = 'Ord' if re.match(r'^(det|nummod|nmod)$', node.udeprel): node.deprel = 'amod' elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): if node.parent.ord < node.ord or node.parent.lemma == 'kali': node.upos = 'ADJ' node.feats['NumType'] = 'Ord' if re.match(r'^(det|nummod|nmod)$', node.udeprel): node.deprel = 'amod' else: node.upos = 'NUM' node.feats['NumType'] = 'Card' node.feats['PronType'] = 'Tot' if re.match(r'^(det|amod|nmod)$', node.udeprel): node.deprel = 'nummod'
[docs] def rejoin_ordinal_numerals(self, node): """ If an ordinal numeral is spelled using digits ('ke-18'), it is often tokenized as multiple tokens, which is wrong. Fix it. """ if node.form.lower() == 'ke': dash = None number = None if node.next_node: if node.next_node.form == '-': dash = node.next_node if dash.next_node and re.match(r'^\d+$', dash.next_node.form): number = dash.next_node node.form = node.form + dash.form + number.form node.lemma = node.lemma + dash.lemma + number.lemma elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node): number = node.next_node node.feats['Typo'] = 'Yes' node.misc['CorrectForm'] = node.form + '-' + number.form node.form = node.form + number.form node.lemma = node.lemma + '-' + number.lemma if number: # Let us pretend that these forms are always ordinal numerals. # Situations where they act as total cardinals will be disambiguated # in a subsequent call to fix_ordinal_numerals(). node.upos = 'ADJ' node.xpos = 'CO-' node.feats['NumType'] = 'Ord' node.misc['MorphInd'] = '^ke<r>_R--+' + number.form + '<c>_CC-$' # Find the parent node. Assume that the dash, if present, was not the head. if node.parent == number: node.parent = number.parent node.deprel = number.deprel if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): node.deprel = 'amod' # Adjust SpaceAfter. node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' # Remove the separate node of the dash and the number. if dash: if len(dash.children) > 0: for c in dash.children: c.parent = node dash.remove() if len(number.children) > 0: for c in number.children: c.parent = node number.remove() # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text()
[docs] def rejoin_decades(self, node): """ In Indonesian, the equivalent of English "1990s" is written as "1990-an". In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it. """ if node.form.lower() == 'an': dash = None number = None if node.prev_node: if node.prev_node.form == '-': dash = node.prev_node if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form): number = dash.prev_node node.form = number.form + dash.form + node.form node.lemma = number.lemma + dash.lemma + node.lemma elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node): number = node.prev_node node.feats['Typo'] = 'Yes' node.misc['CorrectForm'] = number.form + '-' + node.form node.form = number.form + node.form node.lemma = number.lemma + '-' + node.lemma if number: # The combined token is no longer a numeral. It cannot quantify an entity. # Instead, it is itself something like a noun (or perhaps proper noun). node.upos = 'NOUN' node.xpos = 'NSD' node.feats['NumType'] = '' # In some cases, "-an" is labeled as foreign for no obvious reason. node.feats['Foreign'] = '' node.misc['MorphInd'] = '^' + number.form + '<c>_CC-+an<f>_F--$' # Find the parent node. Assume that the dash, if present, was not the head. if node.parent == number: node.parent = number.parent node.deprel = number.deprel if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): node.deprel = 'nmod' # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex. #node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' # Remove the separate node of the dash and the number. if dash: if len(dash.children) > 0: for c in dash.children: c.parent = node dash.remove() if len(number.children) > 0: for c in number.children: c.parent = node number.remove() # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text()
[docs] def merge_reduplication(self, node): """ Reduplication is a common morphological device in Indonesian. Reduplicated nouns signal plural but some reduplications also encode emphasis, modification of meaning etc. In the previous annotation of GSD, reduplication was mostly analyzed as three tokens, e.g., for plurals, the second copy would be attached to the first one as compound:plur, and the hyphen would be attached to the second copy as punct. We want to analyze reduplication as a single token. Fix it. """ # We assume that the previous token is a hyphen and the token before it is the parent. first = node.parent root = node.root # Example of identical reduplication: negara-negara = countries # Example of reduplication with -an: kopi-kopian = various coffee trees # Example of reduplication with vowel substitution: bolak-balik = alternating # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) # Example of reduplication with se-: sehari-hari = daily (hari = day) # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())): hyph = node.prev_node if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): # This is specific to the reduplicated plurals. The rest will be done for any reduplications. # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. ###!!! Some other reduplications have slight modifications on one or the other side. if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): first.feats['Number'] = 'Plur' # For the non-/sub-/anti- prefix we want to take the morphology from the second word. if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()): first.lemma = first.lemma + '-' + node.lemma first.upos = node.upos first.xpos = node.xpos first.feats = node.feats first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd']) # Neither the hyphen nor the current node should have children. # If they do, re-attach the children to the first node. for c in hyph.children: c.parent = first for c in node.children: c.parent = first # Merge the three nodes. # It is possible that the last token of the original annotation # is included in a multi-word token. Then we must extend the # multi-word token to the whole reduplication! Example: # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. mwt = node.multiword_token if mwt: # We assume that the MWT has only two words. We are not prepared for other possibilities. if len(mwt.words) > 2: logging.critical('MWT of only two words is expected') mwtmisc = mwt.misc.copy() second = mwt.words[1] mwt.remove() first.form = first.form + '-' + node.form hyph.remove() node.remove() first.misc['SpaceAfter'] = '' mwt = root.create_multiword_token([first, second], first.form + second.form, mwtmisc) else: first.form = first.form + '-' + node.form if node.no_space_after: first.misc['SpaceAfter'] = 'No' else: first.misc['SpaceAfter'] = '' hyph.remove() node.remove() # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. # If it did not, then we have a mismatch with the sentence text, which we must fix. # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()): prefix = node stem = first # here it is not the first part at all hyph = stem.prev_node if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): # For the non-/sub-/anti- prefix we want to take the morphology from the second word. stem.lemma = prefix.lemma + '-' + stem.lemma stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd']) # Neither the hyphen nor the prefix should have children. # If they do, re-attach the children to the stem. for c in hyph.children: c.parent = stem for c in prefix.children: c.parent = stem # Merge the three nodes. # It is possible that the last token of the original annotation # is included in a multi-word token. Then we must extend the # multi-word token to the whole reduplication! Example: # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. mwt = stem.multiword_token if mwt: # We assume that the MWT has only two words. We are not prepared for other possibilities. if len(mwt.words) > 2: logging.critical('MWT of only two words is expected') mwtmisc = mwt.misc.copy() second = mwt.words[1] mwt.remove() stem.form = prefix.form + '-' + stem.form prefix.remove() hyph.remove() stem.misc['SpaceAfter'] = '' mwt = root.create_multiword_token([stem, second], stem.form + second.form, mwtmisc) else: stem.form = prefix.form + '-' + stem.form prefix.remove() hyph.remove() # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. # If it did not, then we have a mismatch with the sentence text, which we must fix. # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text()
[docs] def fix_plural_propn(self, node): """ It is unlikely that a proper noun will have a plural form in Indonesian. All examples observed in GSD should actually be tagged as common nouns. """ if node.upos == 'PROPN' and node.feats['Number'] == 'Plur': node.upos = 'NOUN' node.lemma = node.lemma.lower() if node.upos == 'PROPN': node.feats['Number'] = ''
[docs] def fix_satu_satunya(self, node): """ 'satu' = 'one' (NUM) 'satu-satunya' = 'the only' """ root = node.root if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu': satu0 = node.parent.parent satu1 = node.parent nya = node dash = None if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-': dash = satu1.prev_node satu0.misc['SpaceAfter'] = 'No' dash.misc['SpaceAfter'] = 'No' root.text = root.compute_text() satu1.deprel = 'compound:redup' nya.parent = satu0 # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian. if node.form == 'nya' and node.parent.form.lower() == 'satu': satu0 = node.parent nya = node if satu0.next_node.form == '-': dash = satu0.next_node if dash.next_node.form.lower() == 'satu': satu1 = dash.next_node if satu1.ord == node.ord-1: # Merge satu0 + dash + satu1 into one node. satu0.form = satu0.form + dash.form + satu1.form dash.remove() satu1.remove() # There should be a multi-word token comprising satu1 + nya. mwt = nya.multiword_token if mwt: mwtmisc = mwt.misc.copy() mwt.remove() mwt = root.create_multiword_token([satu0, nya], satu0.form + nya.form, mwtmisc) satu0.misc['SpaceAfter'] = '' root.text = root.compute_text() if node.multiword_token and node.no_space_after: node.misc['SpaceAfter'] = ''
[docs] def lemmatize_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. # The analysis has been interpreted wrongly for some verbs, so we need # to re-interpret it and extract the correct lemma. morphind = node.misc['MorphInd'] if node.upos == 'VERB': if morphind: # Remove the start and end tags from morphind. morphind = re.sub(r"^\^", "", morphind) morphind = re.sub(r"\$$", "", morphind) # Remove the final XPOS tag from morphind. morphind = re.sub(r"_V[SP][AP]$", "", morphind) # Split morphind to prefix, stem, and suffix. morphemes = re.split(r"\+", morphind) # Expected suffixes are -kan, -i, -an, or no suffix at all. # There is also the circumfix ke-...-an which seems to be nominalized adjective: # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; # but I am not sure what is the reason that these are tagged VERB. if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): del morphemes[-1] # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): del morphemes[0] # Check that we are left with just one morpheme. if len(morphemes) != 1: logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) else: lemma = morphemes[0] # Remove the stem POS category. lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) node.lemma = lemma else: logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) elif node.upos == 'NOUN': if morphind: # Remove the start and end tags from morphind. morphind = re.sub(r"^\^", "", morphind) morphind = re.sub(r"\$$", "", morphind) # Remove the final XPOS tag from morphind. morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) # Do not proceed if there is an unexpected final XPOS tag. if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): # Split morphind to prefix, stem, and suffix. morphemes = re.split(r'\+', morphind) # Expected prefixes are peN-, per-, ke-, ber-. # Expected suffix is -an. if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): del morphemes[-1] if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): del morphemes[0] # Check that we are left with just one morpheme. if len(morphemes) != 1: logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) else: lemma = morphemes[0] # Remove the stem POS category. lemma = re.sub(r'<[a-z]+>', '', lemma) node.lemma = lemma elif node.upos == 'ADJ': if morphind: # Remove the start and end tags from morphind. morphind = re.sub(r"^\^", "", morphind) morphind = re.sub(r"\$$", "", morphind) # Remove the final XPOS tag from morphind. morphind = re.sub(r'_ASS$', '', morphind) # Do not proceed if there is an unexpected final XPOS tag. if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): # Split morphind to prefix, stem, and suffix. morphemes = re.split(r'\+', morphind) # Expected prefix is ter-. if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): del morphemes[0] # Check that we are left with just one morpheme. if len(morphemes) != 1: logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) else: lemma = morphemes[0] # Remove the stem POS category. lemma = re.sub(r'<[a-z]+>', '', lemma) node.lemma = lemma else: logging.warning("No MorphInd analysis found for form '%s'" % (node.form))
[docs] def process_node(self, node): self.fix_plural_propn(node) self.fix_upos_based_on_morphind(node) self.fix_semua(node) self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) self.merge_reduplication(node) self.fix_satu_satunya(node) self.lemmatize_from_morphind(node)