Source code for udapi.block.util.normalize

"""util.Normalize normalizes the ordering of various attributes in CoNLL-U."""
from udapi.core.block import Block

class Normalize(Block):
    """Normalize the ordering of attributes in the FEATS and MISC columns.

    According to the guidelines
    (https://universaldependencies.org/format.html#morphological-annotation),
    the attribute-value pairs in the FEATS column of CoNLL-U files must be
    sorted alphabetically (case-insensitive). The same is highly recommended
    for the MISC column. A normalized ordering is handy e.g. when comparing
    two conllu files with diff.

    Udapi sorts the attributes automatically, but (for speed reasons) only
    when these attributes are written to. This block therefore merely forces
    deserialization of node.feats and node.misc, so that Udapi sorts the
    attributes later, during serialization.
    It is a bit more efficient than something like
    util.Eval node='node.feats["Number"] = node.feats["Number"]'
    or
    util.Eval node='node.misc["NonExistentAttribute"] = None'
    """

    def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False,
                 start_sent_id=1, sent_id_prefix="", **kwargs):
        """
        Args:
        `feats`: normalize the ordering of FEATS. Default=True.
        `misc`: normalize the ordering of MISC. Default=True.
        `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False.
        `empty_node_ord`: normalize ord attributes of empty nodes. Default=False.
        `start_sent_id`: the first sent_id number. Default=1.
        `sent_id_prefix`: a string to be prepended before the integer sent_id.
            Default=empty string.

        Passing a non-empty `sent_id_prefix` or a `start_sent_id` other than 1
        switches sent_id normalization on, regardless of `sent_id`.
        """
        super().__init__(**kwargs)
        self.feats = feats
        self.misc = misc
        self.empty_node_ord = empty_node_ord
        self.next_sent_id = start_sent_id
        self.sent_id_prefix = sent_id_prefix
        # Asking for a custom prefix or start number implies renumbering.
        renumbering_implied = sent_id_prefix or start_sent_id != 1
        self.sent_id = True if renumbering_implied else sent_id

    # TODO: normalize also the order of standardized comments like text, sent_id,...

    def process_bundle(self, bundle):
        """Assign the next sequential id to `bundle` (if enabled) and process its trees."""
        if self.sent_id:
            bundle.bundle_id = self.sent_id_prefix + str(self.next_sent_id)
            self.next_sent_id += 1
        # filter() is lazy, so each tree is checked right before it is processed.
        for tree in filter(self._should_process_tree, bundle):
            self.process_tree(tree)

    def process_tree(self, tree):
        """Renumber empty nodes (if enabled) and then process every regular node."""
        if self.empty_node_ord:
            self._renumber_empty_nodes(tree)
        for node in tree.descendants:
            self.process_node(node)

    def _renumber_empty_nodes(self, tree):
        """Give each empty node the ord "<preceding regular ord>.<1-based index>"."""
        last_regular_ord = 0
        empties_in_row = 0
        for node in tree.descendants_and_empty:
            if not node.is_empty():
                # A regular node starts a new run of empty nodes.
                empties_in_row = 0
                last_regular_ord = node.ord
                continue
            empties_in_row += 1
            current = str(node.ord)
            expected = f"{last_regular_ord}.{empties_in_row}"
            if current == expected:
                continue
            # Make sure all nodes in this sentence have deserialized enhanced deps
            # before the ord is changed (the attribute is read only for this effect).
            for other in tree.descendants_and_empty:
                other.deps
            node.ord = expected

    def process_node(self, node):
        """Force re-serialization of the node's FEATS and/or MISC, as configured."""
        if self.feats:
            self._force_reserialization(node.feats)
        if self.misc:
            self._force_reserialization(node.misc)

    @staticmethod
    def _force_reserialization(attrs):
        """Deserialize `attrs` if needed and discard its stored string form."""
        attrs._deserialize_if_empty()
        attrs._string = None