# Source code for udapi.block.zellig_harris.baseline

from udapi.core.block import Block


def _merge_deprel(deprel):
    """
    Provide a merging of the closely related bags.
    In fact, simply modify deprel name according to (Vulic et al., 2016).

    :param deprel: An original deprel.
    :return: A modified deprel.
    :rtype: str

    """
    if deprel in ['dobj', 'iobj', ]:
        return 'obj'

    if deprel in ['nsubj', 'nsubjpass']:
        return 'subj'

    if deprel in ['xcomp', 'ccomp']:
        return 'comp'

    if deprel in ['advcl', 'advmod']:
        return 'adv'

    return deprel



# [docs]
class Baseline(Block):
    """
    A block for extraction context configurations for training verb representations using word2vecf.

    For every node whose UPOS is in ``self.pos`` the block prints one line per
    extracted context, in the form ``<target> <context>_<relation>``.

    """

    def __init__(self, args=None):
        """
        Initialization.

        Recognised keys of ``args`` (all values are strings):

        * ``pool`` -- comma-separated relation names to extract
          (default ``prep,acl,obj,comp,adv,conj``).
        * ``pos`` -- comma-separated UPOS tags of target nodes (default ``VERB``).
        * ``lemmas`` -- ``'1'`` to output lemmas instead of word forms.
        * ``suffixed_forms`` -- ``'1'`` to strip the last three characters of each word.
        * ``reflexive_verbs`` -- ``'1'`` to append the lemma of an ``expl`` child to the word.

        :param args: A dict of optional parameters.

        """
        super(Baseline, self).__init__(args)

        if args is None:
            args = {}

        self.pool = ['prep', 'acl', 'obj', 'comp', 'adv', 'conj']
        if 'pool' in args:
            self.pool = args['pool'].split(',')

        self.pos = ['VERB']
        if 'pos' in args:
            self.pos = args['pos'].split(',')

        # Boolean switches are enabled only by the exact string '1'.
        self.lemmas = 'lemmas' in args and args['lemmas'] == '1'

        # The presence check and the lookup must use the same key; the
        # previous code tested 'suffixed_form' but read 'suffixed_forms'.
        self.suffixed_forms = 'suffixed_forms' in args and args['suffixed_forms'] == '1'

        self.reflexive_verbs = 'reflexive_verbs' in args and args['reflexive_verbs'] == '1'

    def get_word(self, node):
        """
        Format the correct string representation of the given node according to the block settings.

        :param node: A input node.
        :return: A node's string representation.

        """
        # If the reflexive pronoun should be appended to the verb, take the
        # lemma of the first child attached with the 'expl' relation.
        word_suffix = ''
        if self.reflexive_verbs:
            word_suffix = next(
                (child.lemma for child in node.children if child.deprel == 'expl'),
                '',
            )

        # Use the node's form or the lemma.
        word = node.lemma if self.lemmas else node.form

        # Append the word suffix, if found.
        if word_suffix != '':
            word = '%s_%s' % (word, word_suffix)

        # Convert to lowercase.
        word = word.lower()

        # Remove last 3 chars when the block is applied on a suffixed dataset.
        if self.suffixed_forms:
            word = word[:-3]

        return word

    def print_triple(self, target_node, context_node, relation_name):
        """
        Print to the standard output the context triple according to the block settings.

        The printed line has the form ``<target word> <context word>_<relation name>``.

        :param target_node: A target word.
        :param context_node: A context word.
        :param relation_name: A relation name.

        """
        target_word = self.get_word(target_node)
        context_word = self.get_word(context_node)

        triple = '%s %s_%s' % (target_word, context_word, relation_name)
        # NOTE(review): under Python 3, printing a bytes object emits its
        # repr (b'...') rather than the text -- confirm whether the explicit
        # encode() is still wanted for the interpreter this block runs on.
        print(triple.encode('utf-8'))

    def process_node(self, node):
        """
        Extract context configuration for verbs according to (Vulic et al., 2016).

        Nodes whose UPOS is not listed in ``self.pos`` are skipped.

        :param node: A node to be process.

        """
        # We want to extract contexts only for the configured parts of speech.
        if str(node.upos) not in self.pos:
            return

        # Process node's parent.
        parent_deprel_orig = node.deprel
        parent_deprel_merged = _merge_deprel(parent_deprel_orig)

        if parent_deprel_orig in self.pool:
            self.print_triple(node, node.parent, parent_deprel_orig)

        # A relation changed by merging is printed with an 'I' suffix
        # when the context is the node's parent.
        if parent_deprel_orig != parent_deprel_merged and parent_deprel_merged in self.pool:
            relation_name = '%sI' % parent_deprel_merged
            self.print_triple(node, node.parent, relation_name)

        # For 'conj' the triple is printed a second time; the merged name
        # equals 'conj' here because _merge_deprel leaves it unchanged.
        if parent_deprel_orig in self.pool and parent_deprel_orig == 'conj':
            self.print_triple(node, node.parent, parent_deprel_merged)

        # Process node's children.
        for child in node.children:
            child_deprel_orig = child.deprel
            child_deprel_merged = _merge_deprel(child_deprel_orig)

            if child_deprel_orig in self.pool:
                self.print_triple(node, child, child_deprel_orig)

            if child_deprel_orig != child_deprel_merged and child_deprel_merged in self.pool:
                self.print_triple(node, child, child_deprel_merged)

            # A child that itself has a 'case' dependent yields a 'prep' context.
            if 'prep' in self.pool:
                has_preposition = any(
                    sub_child.deprel == 'case' for sub_child in child.children
                )

                if has_preposition:
                    self.print_triple(node, child, 'prep')