Source code for udapi.block.ud.removemwt

"""Block ud.RemoveMwt for removing multi-word tokens."""
from udapi.core.block import Block


class RemoveMwt(Block):
    """Substitute MWTs with one word representing the whole MWT."""

    def process_tree(self, root):
        """Collapse every multi-word token (MWT) of ``root`` into its first word.

        For each MWT, the first word takes over the MWT's ``form`` and
        ``misc``; its ``upos``, ``feats`` and ``deprel`` are set to whatever
        the ``guess_*`` hooks return.  The MWT itself is then removed and all
        remaining words of the MWT are removed with ``children='rehang'``.

        Args:
            root: tree root providing ``multiword_tokens``.
        """
        # Iterate over a snapshot: ``mwt.remove()`` below is expected to drop
        # the MWT from the tree, and if ``root.multiword_tokens`` exposes the
        # live underlying list, iterating it directly while removing items
        # would silently skip every other MWT.
        for mwt in list(root.multiword_tokens):
            words = mwt.words
            first = words[0]
            # form/misc are assigned before the guess_* hooks run, so the
            # hooks see the first word already carrying the MWT's form.
            first.form = mwt.form
            first.misc = mwt.misc
            first.upos = self.guess_upos(words)
            first.feats = self.guess_feats(words)
            first.deprel = self.guess_deprel(words)
            mwt.remove()
            # The slice is a copy, so removing words cannot disturb the loop.
            # NOTE(review): 'rehang' presumably re-attaches the children of
            # the removed word elsewhere in the tree -- confirm in Node.remove.
            for word in words[1:]:
                word.remove(children='rehang')

    @staticmethod
    def guess_upos(words):
        """UPOS of the whole MWT: the UPOS of its first word."""
        return words[0].upos

    @staticmethod
    def guess_deprel(words):
        """DEPREL of the whole MWT: the DEPREL of its first word."""
        # Alternatively, we could define deprel subtypes
        # return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]])
        return words[0].deprel

    @staticmethod
    def guess_feats(words):
        """FEATS of the whole MWT: the first word's feats updated by the rest.

        Note that this does not build a new object: ``words[0].feats`` is
        updated in place with the feats of each following word (in order)
        and that same object is returned.
        """
        feats = words[0].feats
        for word in words[1:]:
            feats.update(word.feats)
        return feats