Source code for udapi.block.util.normalize

"""util.Normalize normalizes the ordering of various attributes in CoNLL-U."""
from udapi.core.block import Block
from pathlib import Path

[docs] class Normalize(Block): """Normalize the ordering of attributes in the FEATS and MISC columns. The attribute-value pairs in the FEATS column in CoNLL-U files must be sorted alphabetically (case-insensitive) according to the guidelines (https://universaldependencies.org/format.html#morphological-annotation). The same is highly recommended for the MISC column. It is useful e.g. for comparing two conllu files with diff. Udapi does the sorting automatically, but for speed reasons only when writing into these attributes. This block thus just forces deserialization of node.feats and node.misc, so that the Udapi later sorts the attributes during serialization. It is a bit more efficient than something like util.Eval node='node.feats["Number"] = node.feats["Number"]' or util.Eval node='node.misc["NonExistentAttribute"] = None' """ def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", sent_id_from_filename=False, sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. `empty_node_ord`: normalize ord attributes of empty nodes. Default=False. `start_sent_id`: the first sent_id number `sent_id_prefix`: a string to be prepended before the integer sent_id. Default=empty string. `sent_id_from_filename`: add Path(doc.meta["loaded_from"]).stem before the `sent_id_prefix`. Default=False. `sent_id_reset_at_newdoc`: reset the sent_id counter to 1 for each new document. Default=False. `newdoc_from_filename`: set newdoc to Path(doc.meta["loaded_from"]).stem. Default=False. """ super().__init__(**kwargs) self.feats = feats self.misc = misc self.sent_id = sent_id self.empty_node_ord = empty_node_ord self.next_sent_id = start_sent_id self.sent_id_prefix = sent_id_prefix self.sent_id_from_filename = sent_id_from_filename self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc self.newdoc_from_filename = newdoc_from_filename if sent_id_reset_at_newdoc and not sent_id_from_filename: raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename") if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename: self.sent_id = True # TODO: normalize also the order of standardized comments like text, sent_id,...
[docs] def process_bundle(self, bundle): is_newdoc = any(tree.newdoc for tree in bundle.trees) if self.newdoc_from_filename and is_newdoc: tree = next(tree for tree in bundle.trees if tree.newdoc) tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem if self.sent_id: if self.sent_id_reset_at_newdoc and is_newdoc: self.next_sent_id = 1 prefix = self.sent_id_prefix if self.sent_id_from_filename: prefix = Path(bundle.document.meta["loaded_from"]).stem + prefix bundle.bundle_id = prefix + str(self.next_sent_id) self.next_sent_id += 1 for tree in bundle: if self._should_process_tree(tree): self.process_tree(tree)
[docs] def process_tree(self, tree): if self.empty_node_ord: node_ord, empty_ord = 0, 0 for node in tree.descendants_and_empty: if node.is_empty(): empty_ord += 1 old_empty_ord, new_empty_ord = str(node.ord), f"{node_ord}.{empty_ord}" if old_empty_ord != new_empty_ord: # Make sure all nodes in this sentence have deserialized enhanced deps. for n in tree.descendants_and_empty: n.deps node.ord = new_empty_ord else: empty_ord = 0 node_ord = node.ord for node in tree.descendants: self.process_node(node)
[docs] def process_node(self, node): if self.feats: node.feats._deserialize_if_empty() node.feats._string = None if self.misc: node.misc._deserialize_if_empty() node.misc._string = None