Source code for udapi.block.ud.id.addmwt

"""
Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with
MorphInd whose output is stored in MISC attribute MorphInd).
"""
import udapi.block.ud.addmwt
import logging
import re

[docs] class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree)."""
[docs] def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku<p>_PS1|kamu<p>_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB': splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE) # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' upos = 'PRON VERB' if re.search(r'^ku ', splitform.lower()): lemma = re.sub(r'^ku ', 'aku ', splitform.lower()) feats = 'Number=Sing|Person=1|PronType=Prs *' xpos = re.sub(r'\+', ' ', node.xpos) if len(xpos.split())<2: xpos = 'PS1 VSA' else: lemma = re.sub(r'^kau ', 'kamu ', splitform.lower()) feats = 'Number=Sing|Person=2|PronType=Prs *' xpos = re.sub(r'\+', ' ', node.xpos) if len(xpos.split())<2: xpos = 'PS2 VSA' deprel = 'nsubj *' return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia<p>_PS3|aku<p>_PS1|kamu<p>_PS2)\$$', node.misc['MorphInd']): if node.upos == 'VERB': splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) # For transitive verbs with the meN- prefix, -nya is an object clitic. # For passive verbs with the di- prefix, -nya refers to a passive agent. # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False nominalization = not menverb and not diverb # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' if nominalization: lemma = splitform.lower() upos = 'VERB DET' feats = '* Definite=Def|PronType=Art' deprel = '* det' else: upos = 'VERB PRON' if re.search(r' nya$', splitform.lower()): lemma = re.sub(r' nya$', ' dia', splitform.lower()) feats = '* Number=Sing|Person=3|PronType=Prs' elif re.search(r' ku$', splitform.lower()): lemma = re.sub(r' ku$', ' aku', splitform.lower()) feats = '* Number=Sing|Person=1|PronType=Prs' else: lemma = re.sub(r' mu$', ' kamu', splitform.lower()) feats = '* Number=Sing|Person=2|PronType=Prs' # The agent of the passive verb is coded like a direct object of an active verb, # so we might want to use obj:agent rather than obl:agent. However, full nominals # as passive agents can be optionally accompanied by the preposition _oleh_ "by", # which is an argument in favor of saying that they are oblique. So we currently # mark all passive agents as obliques, although it is disputable in Austronesian # languages (unlike Indo-European passives). deprel = '* obl:agent' if diverb else '* obj' xpos = re.sub(r'\+', ' ', node.xpos) # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif re.match(r'(NOUN|PROPN|X)', node.upos): splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' upos = '* PRON' if re.search(r' nya$', splitform.lower()): lemma = re.sub(r' nya$', ' dia', splitform.lower()) feats = '* Number=Sing|Person=3|PronType=Prs' elif re.search(r' ku$', splitform.lower()): lemma = re.sub(r' ku$', ' aku', splitform.lower()) feats = '* Number=Sing|Person=1|PronType=Prs' else: lemma = re.sub(r' mu$', ' kamu', splitform.lower()) feats = '* Number=Sing|Person=2|PronType=Prs' xpos = re.sub(r'\+', ' ', node.xpos) deprel = '* nmod:poss' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE): # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' upos = 'PRON PRON' if re.search(r' nya$', splitform.lower()): lemma = re.sub(r' nya$', ' dia', splitform.lower()) feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs' xpos = 'NSD PS3' elif re.search(r' ku$', splitform.lower()): lemma = re.sub(r' ku$', ' aku', splitform.lower()) feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs' xpos = 'NSD PS1' else: lemma = re.sub(r' mu$', ' kamu', splitform.lower()) feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs' xpos = 'NSD PS2' deprel = '* nmod:poss' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): # nominalized adjective splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() upos = 'ADJ DET' feats = '* Definite=Def|PronType=Art' if re.match(r' ', node.xpos): xpos = re.sub(r'\+', ' ', node.xpos) else: xpos = 'ASP PS3' deprel = '* det' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE): # semua = all (DET) # semuanya = nominalization of semua, i.e., 'everything' (PRON) # banyak = many, much (DET) # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON) splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() upos = 'DET DET' feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art' xpos = re.sub(r'\+', ' ', node.xpos) deprel = '* det' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE): # satu = one (NUM) # satunya = nominalization of satu, meaning 'the only one' splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() upos = 'NUM DET' feats = 'NumType=Card Definite=Def|PronType=Art' xpos = re.sub(r'\+', ' ', node.xpos) deprel = '* det' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): # Fused preposition and pronoun. # Most of them are recognized as R--+PS3 by MorphInd. However, some are different: # bersamanya = 'with him' = VSA+PS3 # dibawahnya = 'under it' = VSP+PS3 # didalamnya = 'inside it' = VSP+PS3 # sekitarnya = 'around it' = D--+PS3 # However: # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3) splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) upos = 'ADP PRON' if re.search(r' nya$', splitform.lower()): lemma = re.sub(r' nya$', ' dia', splitform.lower()) feats = '* Number=Sing|Person=3|PronType=Prs' xpos = 'R-- PS3' elif re.search(r' ku$', splitform.lower()): lemma = re.sub(r' ku$', ' aku', splitform.lower()) feats = '* Number=Sing|Person=1|PronType=Prs' xpos = 'R-- PS1' else: lemma = re.sub(r' mu$', ' kamu', splitform.lower()) feats = '* Number=Sing|Person=2|PronType=Prs' xpos = 'R-- PS2' if node.udeprel == 'case': if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos): deprel = 'nmod' else: deprel = 'obl' else: deprel = '*' deprel = 'case '+deprel return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} else: # Do not warn about instances that are known exceptions. # akibatnya = as a result (SCONJ); akibat = result # bukannya = instead (PART); bukan = no, not # layaknya = like (ADP); layak = worthy # sebaiknya = should (AUX) # sesampainya = once in / arriving at (ADP) # tidaknya = whether or not (PART); tidak = no, not # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) return None elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)<t>_T--\$$', node.misc['MorphInd']): splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() upos = '* PART' feats = '* _' xpos = re.sub(r'\+', ' ', node.xpos) if len(xpos.split()) < 2: xpos = xpos + ' T--' deprel = '* advmod:emph' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} return None
[docs] def postprocess_mwt(self, mwt): """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" match = re.match(r'^\^(.*)\+(aku<p>_PS1|kamu<p>_PS2|dia<p>_PS3|kah<t>_T--|lah<t>_T--|pun<t>_T--|tah<t>_T--)\$$', mwt.misc['MorphInd']) if not match: match = re.match(r'^\^(aku<p>_PS1|kamu<p>_PS2)\+(.*)\$$', mwt.misc['MorphInd']) if match: mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$'