Source code for udapi.block.ud.ru.fixtoest

"""Block to fix annotation of то есть in Russian."""
from udapi.core.block import Block
import logging
import re

class FixToEst(Block):
    """Relabel the Russian fixed expression "то есть" from ``mark`` to ``cc``."""

    def process_node(self, node):
        """
        Re-attach "то есть" ("that is") as ``cc`` instead of ``mark``.

        In the converted data from Kira, the fixed expression "то есть" is
        treated as a subordinator and attached as "mark", which later makes it
        part of complex enhanced relation labels. The block author considers
        that analysis wrong and prefers to label these expressions as "cc".

        A node is rewritten only when all of the following hold:

        * its universal relation (``udeprel``) is ``mark``,
        * its lemma is ``то``,
        * at least one of its children is attached as ``fixed`` and has the
          lemma ``быть``.

        The parent is left unchanged; only the relation label is replaced, in
        the basic tree and in the enhanced graph alike.
        """
        if node.udeprel != 'mark' or node.lemma != 'то':
            return
        # any() stops at the first matching child, so no temporary list is
        # built just to test whether a match exists.
        if any(child.udeprel == 'fixed' and child.lemma == 'быть'
               for child in node.children):
            self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc')

    def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
        """
        Modify the incoming relation of a node both in the basic tree and in
        the enhanced graph.

        If the node does not yet depend in the enhanced graph on the current
        basic parent, the new relation is added without removing any old one.
        If the node already depends (possibly multiple times) on the current
        basic parent in the enhanced graph, all such enhanced relations are
        removed before the new one is added.

        Args:
            node: the node whose incoming relation is changed.
            parent: the new parent, used for both the basic and the enhanced
                relation.
            deprel: the new basic relation label, stored in ``node.deprel``.
            edeprel: the label of the enhanced relation appended to
                ``node.deps``.
        """
        # Remember the basic parent before it is overwritten; enhanced
        # relations are matched against this old parent below.
        old_parent = node.parent
        node.parent = parent
        node.deprel = deprel
        # Drop every enhanced relation that came from the former basic parent.
        # Relations from any other parent are kept.
        node.deps = [edep for edep in node.deps if edep['parent'] != old_parent]
        node.deps.append({'parent': parent, 'deprel': edeprel})