Source code for udapi.block.util.wc

"""Wc is a special block for printing statistics (word count etc)."""
from udapi.core.block import Block



[docs]
class Wc(Block):
    """Block that accumulates simple corpus statistics and prints them.

    Counters are updated once per tree in ``process_tree`` and reported
    on standard output in ``process_end``.
    """

    def __init__(self, tsv=False, **kwargs):
        """Create the Wc block object with all counters set to zero.

        Params:
        tsv: print just one line of tab-separated values (trees, words,
             tokens, MWTs, empty nodes, documents, paragraphs) instead of
             the labelled multi-line report
        """
        super().__init__(**kwargs)
        self.tsv = tsv
        # Running totals over all processed trees.
        self.trees = 0
        self.words = 0
        self.mwts = 0
        self.tokens = 0
        self.empty = 0
        self.docs = 0
        self.paragraphs = 0

    def process_tree(self, tree):
        """Add the counts of a single tree to the running totals."""
        self.trees += 1

        word_count = len(tree.descendants)
        mwt_count = len(tree.multiword_tokens)
        self.words += word_count
        self.mwts += mwt_count

        # Without multi-word tokens the token count equals the word count,
        # so token_descendants is only queried when the tree has some MWT.
        if mwt_count:
            self.tokens += len(tree.token_descendants)
        else:
            self.tokens += word_count

        self.empty += len(tree.empty_nodes)

        # A document is counted when the tree is flagged with newdoc or
        # when it is the first tree of the first bundle of its document.
        starts_document = tree.newdoc or tree == tree.document[0].trees[0]
        if starts_document:
            self.docs += 1

        if tree.newpar:
            self.paragraphs += 1

    def process_end(self):
        """Print the accumulated statistics to standard output.

        With ``tsv`` set, a single tab-separated line is printed in the
        order trees, words, tokens, MWTs, empty nodes, documents,
        paragraphs. Otherwise one labelled line per statistic is printed;
        trees and words are always shown, the remaining lines only when
        the counter deciding their visibility is non-zero.
        """
        if self.tsv:
            columns = (self.trees, self.words, self.tokens, self.mwts,
                       self.empty, self.docs, self.paragraphs)
            print('\t'.join(str(value) for value in columns))
            return

        # (visibility switch, value, label) in output order. The tokens
        # line is tied to the MWT counter, not to the token counter.
        report = (
            (True, self.trees, 'trees'),
            (True, self.words, 'words'),
            (self.mwts, self.mwts, 'multi-word tokens'),
            (self.mwts, self.tokens, 'tokens'),
            (self.empty, self.empty, 'empty nodes'),
            (self.docs, self.docs, 'documents'),
            (self.paragraphs, self.paragraphs, 'paragraphs'),
        )
        for shown, count, label in report:
            if shown:
                print('%8d %s' % (count, label))