Source code for udapi.block.util.wc

"""Wc is a special block for printing statistics (word count etc)."""
from udapi.core.block import Block


class Wc(Block):
    """Block that accumulates corpus counters and prints them as statistics.

    The counters are: trees, words, multi-word tokens, tokens, empty nodes,
    documents and paragraphs.
    """

    def __init__(self, tsv=False, **kwargs):
        """Create the Wc block object with all counters set to zero.

        Params:
        tsv: if true, `process_end` prints a single line of tab-separated
            values in this order: trees, words, tokens, MWTs, empty nodes,
            documents, paragraphs. Otherwise a multi-line human-readable
            summary is printed.
        **kwargs: forwarded unchanged to the `Block` constructor.
        """
        super().__init__(**kwargs)
        self.trees = 0
        self.words = 0
        self.mwts = 0
        self.tokens = 0
        self.empty = 0
        self.docs = 0
        self.paragraphs = 0
        self.tsv = tsv

    def process_tree(self, tree):
        """Update the counters with the statistics of one tree.

        Words are counted from `tree.descendants`. Tokens are counted from
        `tree.token_descendants` when the tree has at least one multi-word
        token; otherwise the token count equals the word count.
        """
        self.trees += 1
        self.words += len(tree.descendants)

        mwt_count = len(tree.multiword_tokens)
        self.mwts += mwt_count
        if mwt_count:
            self.tokens += len(tree.token_descendants)
        else:
            self.tokens += len(tree.descendants)

        self.empty += len(tree.empty_nodes)

        # A document is counted for a tree flagged with `newdoc`, and also for
        # the very first tree of the document (which may lack that flag).
        if tree.newdoc or tree == tree.document[0].trees[0]:
            self.docs += 1
        if tree.newpar:
            self.paragraphs += 1

    def process_end(self):
        """Print the accumulated statistics to standard output.

        In TSV mode exactly one line is printed. In the default mode the
        trees and words are always printed, while the remaining counters are
        printed only when they are non-zero.
        """
        if self.tsv:
            columns = (
                self.trees,
                self.words,
                self.tokens,
                self.mwts,
                self.empty,
                self.docs,
                self.paragraphs,
            )
            print('\t'.join(str(value) for value in columns))
            return

        print('%8d trees\n%8d words' % (self.trees, self.words))
        if self.mwts:
            # The token count is reported together with the MWT count only.
            print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens))
        if self.empty:
            print('%8d empty nodes' % self.empty)
        if self.docs:
            print('%8d documents' % self.docs)
        if self.paragraphs:
            print('%8d paragraphs' % self.paragraphs)