"""Block class represents the basic Udapi processing unit."""
import logging
import inspect
def not_overridden(method):
    """Mark ``method`` as a default (not overridden) implementation.

    The function object gets the attribute ``is_not_overridden = True`` and is
    returned unchanged, so this can be used as a decorator. A subclass that
    overrides the method defines a new function without this attribute, which
    is what ``Block.process_document`` checks (via ``hasattr``) to find out
    which ``process_*`` hooks actually need to be called.
    """
    method.is_not_overridden = True
    return method
class Block:
    """The smallest processing unit for processing Universal Dependencies data.

    Subclasses customize behavior by overriding one or more of the
    ``process_*`` methods. ``process_document`` detects which of them were
    overridden (the default implementations carry the ``is_not_overridden``
    marker set by ``@not_overridden``) and dispatches only to those.

    Parameters:
        zones: which zone to process (default="all"). Several zones may be
            given as one comma-separated string.
        if_empty_tree: what to do when encountering a tree with no nodes.
            Possible values are: process (default), skip, skip_warn, fail, delete.
    """

    def __init__(self, zones='all', if_empty_tree='process', **kwargs):
        """Store the common block parameters and reject unknown ones.

        Raises:
            TypeError: if any extra keyword arguments are left in ``kwargs``.
                The message lists the parameters accepted by the ``__init__``
                methods of this class and its ancestors.
        """
        self.zones = zones
        self.if_empty_tree = if_empty_tree
        if kwargs:
            # Collect parameter names from every class in the MRO except
            # the last one (``object``), so the hint covers inherited parameters.
            params = set()
            for cls in type(self).mro()[:-1]:
                params.update(inspect.signature(cls.__init__).parameters.keys())
            params -= {'self', 'kwargs'}
            raise TypeError(f"Extra parameters {kwargs}.\n"
                            f"Parameters of {self.block_name()} are:\n"
                            + '\n'.join(sorted(params)))

    def block_name(self):
        """Return a short name of this block: package path plus class name.

        The last component of ``__module__`` (the module file name) is dropped
        and a leading ``udapi.block.`` prefix is stripped, e.g. a class
        ``Eval`` in module ``udapi.block.util.eval`` yields ``util.Eval``.
        """
        module = self.__module__.rpartition('.')[0]
        prefix = 'udapi.block.'
        if module.startswith(prefix):
            module = module[len(prefix):]
        return module + "." + self.__class__.__name__

    def process_start(self):
        """A hook method that is executed before processing UD data"""

    def process_end(self):
        """A hook method that is executed after processing all UD data"""

    @not_overridden
    def process_node(self, _):
        """Process a UD node"""

    @not_overridden
    def process_empty_node(self, _):
        """Process an empty node (in enhanced dependencies)"""

    @not_overridden
    def process_tree(self, tree):
        """Process a UD tree"""
        # tree.descendants is slightly slower than tree._descendants
        # (0.05s per iterating over 700k words),
        # but it seems safer to iterate over a copy of the list of nodes.
        # If a user calls parent.create_child().shift_before_node(parent) in process_node,
        # it may end up in endless cycle (because the same node is processed again
        # - Python for cycle remembers the position).
        for node in tree.descendants:
            self.process_node(node)

    @not_overridden
    def process_bundle(self, bundle):
        """Process a UD bundle"""
        for tree in bundle:
            if self._should_process_tree(tree):
                self.process_tree(tree)

    def run(self, document):
        """Run the whole block on one document.

        Calls ``process_start``, then ``apply_on_document``, then ``process_end``.
        """
        self.process_start()
        self.apply_on_document(document)
        self.process_end()

    def apply_on_document(self, document):
        """Call ``process_document`` wrapped in the before/after document hooks."""
        self.before_process_document(document)
        self.process_document(document)
        self.after_process_document(document)

    def process_document(self, document):
        """Process a UD document.

        Dispatches to the most general overridden hook on each level:
        coreference entities (or, failing that, mentions), then bundles,
        (or trees, or nodes and empty nodes).

        Raises:
            Exception: if none of the ``process_*`` hooks is overridden.
        """
        # Calling document.coref_entities is expensive because
        # it needs to deserialize coref_entities from the MISC attributes.
        # If no block in a scenario needs to process coreference entities/mentions,
        # the deserialization does not need to be done.
        # So we need to detect if any of the methods process_coref_entity and process_coref_mention
        # has been overridden (without calling them, which could have adverse side effects).
        # Let's use method annotations for this.
        def overridden(method):
            return not hasattr(method, 'is_not_overridden')

        p_entity = overridden(self.process_coref_entity)
        p_mention = overridden(self.process_coref_mention)
        p_bundle = overridden(self.process_bundle)
        p_tree = overridden(self.process_tree)
        p_node = overridden(self.process_node)
        p_empty_node = overridden(self.process_empty_node)
        if not any((p_entity, p_mention, p_bundle, p_tree, p_node, p_empty_node)):
            raise Exception("No processing activity defined in block " + self.block_name())

        if p_entity or p_mention:
            for entity in document.coref_entities:
                if p_entity:
                    self.process_coref_entity(entity)
                else:
                    for mention in entity.mentions:
                        self.process_coref_mention(mention)

        if not (p_bundle or p_tree or p_node or p_empty_node):
            return

        # Compute the name once and pass lazy %-style arguments, so nothing is
        # formatted per bundle unless the debug message is actually emitted.
        block_name = self.block_name()
        for bundle_no, bundle in enumerate(document.bundles, 1):
            logging.debug('Block %s processing bundle #%d (id=%s)',
                          block_name, bundle_no, bundle.bundle_id)
            if p_bundle:
                self.process_bundle(bundle)
                continue
            for tree in bundle:
                if not self._should_process_tree(tree):
                    continue
                if p_tree:
                    self.process_tree(tree)
                    continue
                if p_node:
                    for node in tree.descendants:
                        self.process_node(node)
                if p_empty_node:
                    for empty_node in tree.empty_nodes:
                        self.process_empty_node(empty_node)

    @not_overridden
    def process_coref_entity(self, entity):
        """This method is called on each coreference entity in the document."""
        for mention in entity.mentions:
            self.process_coref_mention(mention)

    @not_overridden
    def process_coref_mention(self, mention):
        """This method is called on each coreference mention in the document."""

    def before_process_document(self, document):
        """This method is called before each process_document."""

    def after_process_document(self, document):
        """This method is called after each process_document."""

    def _should_process_tree(self, tree):
        """Return True if ``tree`` should be processed by this block.

        A tree without descendants is first handled according to
        ``self.if_empty_tree``: ``skip`` returns False, ``delete`` calls
        ``tree.remove()`` and returns False, ``skip_warn`` logs a warning and
        returns False, ``fail`` raises. With ``process`` (or for a non-empty
        tree) the decision depends on ``self.zones``: ``'all'`` accepts every
        tree, otherwise ``tree.zone`` must be one of the comma-separated zones.

        Raises:
            Exception: if the tree is empty and ``if_empty_tree`` is ``fail``.
            ValueError: if the tree is empty and ``if_empty_tree`` has an
                unknown value (the value is only checked at this point).
        """
        if self.if_empty_tree != 'process' and not tree.descendants:
            if self.if_empty_tree == 'skip':
                return False
            if self.if_empty_tree == 'delete':
                tree.remove()
                return False
            if self.if_empty_tree == 'skip_warn':
                logging.warning("Tree %s is empty", tree)
                return False
            if self.if_empty_tree == 'fail':
                raise Exception("Tree %s is empty" % tree)
            # Interpolate instead of concatenating, so that a non-string value
            # (e.g. None) is reported as ValueError rather than a TypeError.
            raise ValueError(f"Unknown value for if_empty_tree: {self.if_empty_tree}")
        if self.zones == 'all':
            return True
        if self.zones == '' and tree.zone == '':
            return True
        if tree.zone in self.zones.split(','):
            return True
        return False