Source code for udapi.block.zellig_harris.baseline

from udapi.core.block import Block


def _merge_deprel(deprel):
    """
    Provide a merging of the closely related bags.
    In fact, simply modify deprel name according to (Vulic et al., 2016).

    :param deprel: An original deprel.
    :return: A modified deprel.
    :rtype: str

    """
    if deprel in ['dobj', 'iobj', ]:
        return 'obj'

    if deprel in ['nsubj', 'nsubjpass']:
        return 'subj'

    if deprel in ['xcomp', 'ccomp']:
        return 'comp'

    if deprel in ['advcl', 'advmod']:
        return 'adv'

    return deprel


[docs] class Baseline(Block): """ A block for extraction context configurations for training verb representations using word2vecf. """ def __init__(self, args=None): """ Initialization. :param args: A dict of optional parameters. """ super(Baseline, self).__init__(args) if args is None: args = {} self.pool = ['prep', 'acl', 'obj', 'comp', 'adv', 'conj'] if 'pool' in args: self.pool = args['pool'].split(',') self.pos = ['VERB'] if 'pos' in args: self.pos = args['pos'].split(',') self.lemmas = False if 'lemmas' in args and args['lemmas'] == '1': self.lemmas = True self.suffixed_forms = False if 'suffixed_form' in args and args['suffixed_forms'] == '1': self.suffixed_forms = True self.reflexive_verbs = False if 'reflexive_verbs' in args and args['reflexive_verbs'] == '1': self.reflexive_verbs = True
[docs] def get_word(self, node): """ Format the correct string representation of the given node according to the block settings. :param node: A input node. :return: A node's string representation. """ # If reflexive pronoun should be append to the verb, try to find such # pronoun for each verb. word_suffix = '' if self.reflexive_verbs: for child in node.children: if child.deprel == 'expl': word_suffix = child.lemma break # Use the node's form or the lemma. word = node.form if self.lemmas: word = node.lemma # Append the word suffix, if found. if word_suffix != '': word = '%s_%s' % (word, word_suffix) # Convert to lowercase. word = word.lower() # Remove last 3 chars when the block is applied on a suffixed dataset. if self.suffixed_forms: word = word[:-3] return word
[docs] def print_triple(self, target_node, context_node, relation_name): """ Print to the standard output the context triple according to the block settings. :param target_node: A target word. :param context_node: A context word. :param relation_name: A relation name. """ target_word = self.get_word(target_node) context_word = self.get_word(context_node) triple = '%s %s_%s' % (target_word, context_word, relation_name) print(triple.encode('utf-8'))
[docs] def process_node(self, node): """ Extract context configuration for verbs according to (Vulic et al., 2016). :param node: A node to be process. """ # We want to extract contexts only for verbs. if str(node.upos) not in self.pos: return # Process node's parent. parent_deprel_orig = node.deprel parent_deprel_merged = _merge_deprel(parent_deprel_orig) if parent_deprel_orig in self.pool: self.print_triple(node, node.parent, parent_deprel_orig) if parent_deprel_orig != parent_deprel_merged and parent_deprel_merged in self.pool: relation_name = '%sI' % parent_deprel_merged self.print_triple(node, node.parent, relation_name) if parent_deprel_orig in self.pool and parent_deprel_orig == 'conj': self.print_triple(node, node.parent, parent_deprel_merged) # Process node's children. for child in node.children: child_deprel_orig = child.deprel child_deprel_merged = _merge_deprel(child_deprel_orig) if child_deprel_orig in self.pool: self.print_triple(node, child, child_deprel_orig) if child_deprel_orig != child_deprel_merged and child_deprel_merged in self.pool: self.print_triple(node, child, child_deprel_merged) if 'prep' in self.pool: has_preposition = False for sub_child in child.children: if sub_child.deprel == 'case': has_preposition = True break if has_preposition: self.print_triple(node, child, 'prep')