# Source code for udapi.block.ud.printfixed

"""
Block PrintFixed prints occurrences of fixed multiword expressions in UD. It
can be run twice in a row, first collecting known fixed expressions and then
also reporting other occurrences of these expressions where they are not
annotated as fixed.

Usage:
udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt
udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less

Author: Dan Zeman
"""
from udapi.core.block import Block
import re
import logging

class PrintFixed(Block):
    """Print fixed multiword expressions.

    For every node that has children attached with the ``fixed`` relation,
    the expression (the node plus its fixed children, with ``X`` standing
    for intervening nodes) is printed. When a list of known expressions has
    been loaded, word sequences that match a known expression but are not
    annotated as fixed are printed as well, marked ``NOT FIXED``.
    """

    def __init__(self, only_forms=False, known_expressions=None, **kwargs):
        """Create the PrintFixed block.

        Parameters:
        only_forms=1: print the word forms but not tags and other info;
            This can be used to create the list of known forms that we want
            to identify even if they are not annotated as fixed.
        known_expressions: the name of the text file with the expressions
            (UTF-8, one expression per line, words separated by one space)
        """
        super().__init__(**kwargs)
        self.only_forms = only_forms
        # Expression string -> number of lines of the file that contained it.
        self.known_expressions = {}
        # First word of every known expression (used as a cheap pre-filter).
        self.first_words = {}
        # Number of words of the longest known expression (at least 2).
        self.max_length = 2
        if known_expressions:
            self._read_known_expressions(known_expressions)

    def _read_known_expressions(self, path):
        """Load known expressions from the text file `path`."""
        n = 0
        # The context manager guarantees that the file is closed even if
        # reading fails half-way.
        with open(path, 'r', encoding='utf-8') as fh:
            for line in fh:
                expression = line.rstrip('\n')
                if not expression:
                    # A blank line would otherwise register the empty string
                    # as both a known expression and a first word.
                    continue
                seen = self.known_expressions.get(expression, 0)
                self.known_expressions[expression] = seen + 1
                if not seen:
                    logging.info("Read known fixed expression '%s'", expression)
                    n += 1
                words = expression.split(' ')
                self.first_words[words[0]] = 1
                if len(words) > self.max_length:
                    self.max_length = len(words)
        logging.info('Read %d known fixed expressions.', n)

    def process_node(self, node):
        """Print the fixed expression headed by `node`, if there is one.

        If `node` has no fixed children but its lowercased form is the first
        word of a known expression, check whether a known expression starts
        here and print it as not annotated.
        """
        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
        if fixed_children:
            self._print_annotated(node, fixed_children[-1])
        # If this is not the first word of a fixed expression, check whether
        # something that looks like a known fixed expression starts here.
        # Note that it is also possible that a known expression starts here
        # but only a subset is actually marked as such; we currently do not
        # account for this.
        elif node.form.lower() in self.first_words:
            self._print_unannotated(node)

    def _print_annotated(self, node, last_fixed):
        """Print the expression from `node` up to its child `last_fixed`."""
        # Fixed children are always to the right of the parent. But there
        # may be other nodes in between that are not fixed children (for
        # example, there may be punctuation that is attached to one of the
        # fixed nodes).
        list_of_forms = [node.form.lower()]
        list_of_tags = [node.upos]
        n = node
        while n != last_fixed:
            n = n.next_node
            if not n:
                # We ran past the last node without meeting the fixed child,
                # i.e. it does not follow its parent. Report it instead of
                # crashing on the missing node.
                logging.warning(
                    "Fixed child of '%s' does not follow its parent; skipping.",
                    node.form)
                return
            if n.parent == node and n.udeprel == 'fixed':
                list_of_forms.append(n.form.lower())
                list_of_tags.append(n.upos)
            else:
                list_of_forms.append('X')
                list_of_tags.append('X')
        forms = ' '.join(list_of_forms)
        if self.only_forms:
            print(forms)
        else:
            tags = ' '.join(list_of_tags)
            print("%s / %s / %s" % (forms, tags, node.deprel))

    def _print_unannotated(self, node):
        """Print the shortest known expression that starts at `node`."""
        list_of_forms = [node.form.lower()]
        list_of_tags = [node.upos]
        n = node
        # Grow the candidate one word at a time, up to the longest known
        # expression, and stop at the first match.
        for _ in range(self.max_length - 1):
            n = n.next_node
            if not n:
                break
            ###!!! At present we cannot identify known expressions with gaps ('X').
            list_of_forms.append(n.form.lower())
            list_of_tags.append(n.upos)
            forms = ' '.join(list_of_forms)
            if forms in self.known_expressions:
                if self.only_forms:
                    print(forms)
                else:
                    tags = ' '.join(list_of_tags)
                    print("%s / %s / NOT FIXED" % (forms, tags))
                break