Source code for udapi.block.corefud.stats

from udapi.core.block import Block
from collections import Counter
import re


[docs]
class Stats(Block):
    """Block corefud.Stats prints various coreference-related statistics."""

    def __init__(self, m_len_max=5, e_len_max=5,
                 report_basics=False, report_mentions=True, report_entities=True,
                 report_details=True, report_words_per_doc=False, report_entity_range=False,
                 report_docs=True, report_empty_nodes=True,
                 selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _',
                 exclude_singletons=False, exclude_nonsingletons=False, style='human',
                 per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15,
                 highlight_docnames=None,
                 **kwargs):
        """Configure which statistics are collected and how they are printed.

        Args:
            m_len_max: mentions with at least this many non-empty words are
                pooled into the last ("N+") bucket of the mention-length distribution.
            e_len_max: entities with at least this many mentions are pooled
                into the last ("N+") bucket of the entity-length distribution.
            report_basics: collect and print counts of sentences, words,
                empty nodes and newdoc-marked documents.
            report_mentions: print the mention columns.
            report_entities: print the entity columns.
            report_details: print shares of mentions with empty nodes, with gaps,
                non-treelet mentions and the distribution of head UPOS.
            report_words_per_doc: with report_basics, also print max and
                average words per document.
            report_entity_range: print the 95th percentile of the distance
                (in document-order node positions) between the heads of the
                first and the last mention of an entity.
            report_docs: with report_basics, include the "docs" column.
            report_empty_nodes: with report_basics, include the "empty" column.
            selected_upos: space-separated UPOS tags reported separately for
                mention heads (other tags are counted as "other");
                the value 'all' reports every tag that was seen.
            exclude_singletons: skip entities with exactly one mention.
            exclude_nonsingletons: skip entities with more than one mention.
            style: output format, one of 'tex', 'tex-table', 'tex-doc', 'human'.
            per_doc: print one row per processed document and reset the
                counters after each of them.
            max_rows_per_page: in per_doc mode, close the current table once
                more than this many rows have been printed.
            docname: where the per-document row label comes from,
                'newdoc' (newdoc attribute of the first tree) or
                'filename' (doc.meta['loaded_from']).
            docname_len: width to which the row label is padded.
            highlight_docnames: optional regular expression; in tex styles,
                matching document names are prefixed with ``\\NEW``.
            **kwargs: forwarded to the base Block constructor.

        Raises:
            ValueError: if `style` or `docname` is not one of the supported values.
        """
        super().__init__(**kwargs)
        self.m_len_max = m_len_max
        self.e_len_max = e_len_max
        self.report_basics = report_basics
        self.report_mentions = report_mentions
        self.report_entities = report_entities
        self.report_details = report_details
        self.report_words_per_doc = report_words_per_doc
        self.report_entity_range = report_entity_range
        self.report_docs = report_docs
        self.report_empty_nodes = report_empty_nodes
        self.exclude_singletons = exclude_singletons
        self.exclude_nonsingletons = exclude_nonsingletons
        self.style = style
        if style not in ('tex', 'tex-table', 'tex-doc', 'human'):
            raise ValueError(f'Unknown style {style}')
        self.per_doc = per_doc
        self.max_rows_per_page = max_rows_per_page
        if docname not in ('newdoc', 'filename'):
            # Report the offending option; the message used to blame `style`.
            raise ValueError(f'Unknown docname {docname}')
        self.docname = docname
        self.docname_len = docname_len
        self.highlight_docnames = highlight_docnames
        self._header_printed = False
        # None = nothing printed yet; 0 = a new table page must be opened.
        self._lines_printed = None

        # Accumulators; in per_doc mode they are reset after every document.
        self.counter = Counter()
        self.mentions = 0
        self.entities = 0
        self.singletons = 0
        self.total_nodes = 0
        self.longest_mention = 0
        self.longest_entity = 0
        self.m_words = 0
        # None means "report every UPOS that occurs".
        self.selected_upos = None if selected_upos == 'all' else selected_upos.split()
        self.entity_ranges = []


[docs]
    def process_document(self, doc):
        """Add the coreference statistics of `doc` to the block's accumulators.

        Counts nodes, entities (honouring exclude_singletons /
        exclude_nonsingletons), mentions and their properties, and, when
        report_basics is set, sentences, words, empty nodes and
        newdoc-delimited documents.

        Args:
            doc: the document to process; the method reads its `nodes`,
                `nodes_and_empty`, `coref_entities` and `trees`.
        """
        self.total_nodes += len(list(doc.nodes))
        self.counter['documents'] += 1

        # Document-order position of every node (including empty ones),
        # needed only for measuring the span of an entity.
        node2docord = {}
        if self.report_entity_range:
            node2docord = {node: docord for docord, node in enumerate(doc.nodes_and_empty)}

        for entity in doc.coref_entities:
            len_mentions = len(entity.mentions)
            # Singletons are counted even when they are excluded below.
            if len_mentions == 1:
                self.singletons += 1
            if len_mentions == 1 and self.exclude_singletons:
                continue
            elif len_mentions > 1 and self.exclude_nonsingletons:
                continue
            if self.report_entity_range:
                first_head = entity.mentions[0].head
                last_head = entity.mentions[-1].head
                self.entity_ranges.append(node2docord[last_head] - node2docord[first_head])
            self.longest_entity = max(len_mentions, self.longest_entity)
            self.counter['c_total_len'] += len_mentions
            # Entities of e_len_max or more mentions share the last bucket.
            self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1

            self.entities += 1
            if not self.report_mentions and not self.report_details:
                continue
            for mention in entity.mentions:
                self.mentions += 1
                all_words = len(mention.words)
                non_empty = len([w for w in mention.words if not w.is_empty()])
                self.m_words += all_words
                # Mention length is measured in non-empty words only.
                self.longest_mention = max(non_empty, self.longest_mention)
                self.counter['m_total_len'] += non_empty
                self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1
                if self.report_details:
                    upos = 'other'
                    if not self.selected_upos or mention.head.upos in self.selected_upos:
                        upos = mention.head.upos
                    self.counter['m_head_upos_' + upos] += 1
                    self.counter['m_with_empty'] += 1 if all_words > non_empty else 0
                    self.counter['m_with_gaps'] += 1 if ',' in mention.span else 0
                    # Count words whose parent lies outside the mention;
                    # more than one such word means the mention is not a treelet.
                    heads, mwords = 0, set(mention.words)
                    for w in mention.words:
                        if w.parent:
                            heads += 0 if w.parent in mwords else 1
                        else:
                            # No basic parent: fall back to the enhanced deps.
                            heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
                    self.counter['m_nontreelet'] += 1 if heads > 1 else 0

        if self.report_basics:
            doc_words = 0
            for tree in doc.trees:
                tree_words = len(tree.descendants)
                self.counter['sents'] += 1
                self.counter['words'] += tree_words
                self.counter['empty'] += len(tree.empty_nodes)
                if tree.newdoc:
                    # A new document starts: close the running total of the previous one.
                    self.counter['newdocs'] += 1
                    if doc_words > self.counter['max_words_per_doc']:
                        self.counter['max_words_per_doc'] = doc_words
                    doc_words = 0
                doc_words += tree_words
            # The last document has no following newdoc marker, so its size
            # must be compared here; otherwise it would never be counted.
            if doc_words > self.counter['max_words_per_doc']:
                self.counter['max_words_per_doc'] = doc_words



[docs]
    def after_process_document(self, doc):
        """In per_doc mode, print the row for `doc` and start counting afresh.

        Does nothing unless per_doc is set; otherwise the statistics of the
        document just processed are printed via process_end and every
        accumulator is reset so that the next document starts from zero.
        """
        if not self.per_doc:
            return
        self.process_end(skip=False, doc=doc)

        # Fresh containers for the next document.
        self.counter = Counter()
        self.entity_ranges = []
        # Scalar accumulators go back to zero.
        self.mentions = self.entities = self.singletons = 0
        self.total_nodes = self.m_words = 0
        self.longest_mention = self.longest_entity = 0



[docs]
    def process_end(self, skip=True, doc=None):
        """Print the accumulated statistics as one row of output.

        In tex styles the row is a `&`-separated table line; in the 'human'
        style every statistic is printed as a `name = value` line.  The table
        header is printed first whenever no row has been printed on the
        current page yet.

        Args:
            skip: in per_doc mode, True means "all rows were already printed,
                only close the table"; after_process_document passes False
                to print the row of a single document.
            doc: the document whose row is printed (per_doc mode only);
                used to obtain the row label.
        """
        if not self._lines_printed:
            self.print_header()
            self._lines_printed = 0
        is_tex = self.style.startswith('tex')
        if self.per_doc:
            if skip:
                self.print_footer()
                return
            docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc
            if is_tex:
                if self.highlight_docnames and re.search(self.highlight_docnames, docname):
                    docname = r"\NEW " + docname
                # Underscores must be escaped in LaTeX text.
                docname = docname.replace('_', r'\_')
            print(f"{docname:{self.docname_len}}", end='&' if is_tex else '\n')
        elif self.style.startswith('tex-'):
            print(f"{self.counter['documents']:4} documents &")
        self._lines_printed += 1

        # Denominators guarded against zero so that empty input prints zeros
        # instead of raising ZeroDivisionError.
        mentions_nonzero = 1 if self.mentions == 0 else self.mentions
        entities_nonzero = 1 if self.entities == 0 else self.entities
        total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes
        # Input without any newdoc marker used to crash the words/doc column.
        newdocs_nonzero = 1 if self.counter['newdocs'] == 0 else self.counter['newdocs']

        # (column name, formatted value) pairs in output order.
        columns = []
        if self.report_basics:
            if self.report_docs:
                columns += [('docs', f"{self.counter['newdocs']:6,}"),]
            columns += [('sents', f"{self.counter['sents']:7,}"),
                        ('words', f"{self.counter['words']:9,}"),]
            if self.report_empty_nodes:
                columns += [('empty', f"{self.counter['empty']:7,}"),]
            if self.report_words_per_doc:
                columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"),
                            ('words/doc', f"{self.counter['words'] / newdocs_nonzero:7,.0f}"),]
        if self.report_entities:
            columns += [('entities', f"{self.entities:7,}"),
                        ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"),
                        ('longest_entity', f"{self.longest_entity:6}"),
                        ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")]
            if self.report_entity_range:
                self.entity_ranges.sort()
                percentile = self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0
                columns += [('entity_range_95percentile', f"{percentile:6,}"),]
            # Percentage of entities per length bucket; the last bucket is "N+".
            for i in range(1, self.e_len_max + 1):
                percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero
                columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}"))
        if self.report_mentions:
            columns += [('mentions', f"{self.mentions:7,}"),
                        ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"),
                        ('longest_mention', f"{self.longest_mention:6}"),
                        ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")]
            if self.m_len_max:
                # Starts at 0: a mention may consist of empty nodes only.
                for i in range(0, self.m_len_max + 1):
                    percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero
                    columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}"))
        if self.report_details:
            columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"),
                        ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"),
                        ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}"),]
            if self.selected_upos:
                upos_list = self.selected_upos + ['other']
            else:
                # Strip the 'm_head_upos_' prefix (12 characters) from the counter keys.
                upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')]
            for upos in upos_list:
                columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}"))

        if is_tex:
            print(" &".join(c[1] for c in columns), end=" \\\\\n")
        elif self.style == 'human':
            for c in columns:
                print(f"{c[0]:>15} = {c[1].strip():>10}")
        if not self.per_doc:
            self.print_footer()
        elif self._lines_printed > self.max_rows_per_page:
            # Page is full: close the table but keep the LaTeX document open;
            # resetting the counter makes the next call print a new header.
            self.print_footer(False)
            self._lines_printed = 0



[docs]
    def print_header(self):
        """Print the LaTeX table header; do nothing for styles not starting with 'tex-'.

        For the 'tex-doc' style the document preamble is printed as well,
        but only before the very first table.  The header consists of four
        lines that are built in parallel: the tabular column specification
        and three rows of column titles, followed by the `\\cmidrule`
        decorations under the grouped titles.
        """
        if not self.style.startswith('tex-'):
            return
        if self.style == 'tex-doc':
            # _lines_printed is None only before the first table of the run.
            if self._lines_printed is None:
                print(r'\documentclass[multi=mypage]{standalone}')
                print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}')
                print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}')
                print(r'\title{Udapi coreference statistics}')
                print(r'\begin{document}')
        print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}')
        # lines[0] = tabular column spec, lines[1..3] = three title rows.
        lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ',
                 " " * self.docname_len,
                 ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8),
                 " " * self.docname_len]
        if self.report_basics:
            basic_cols = []
            if self.report_docs:
                basic_cols.append('docs')
            basic_cols.extend(['sents', 'words'])
            if self.report_empty_nodes:
                basic_cols.append('empty n.')

            lines[0] += "r" * len(basic_cols) + " "
            lines[1] += r'& \MC{' + str(len(basic_cols)) + r'}{text size}                  '
            lines[2] += r'& \MC{' + str(len(basic_cols)) + r'}{total number of}            '
            lines[3] += '&' + '&'.join(f"{label:>7}" for label in basic_cols)
            if self.report_words_per_doc:
                lines[0] += "rr "
                lines[1] += r'&        &        '
                lines[2] += r'&\MC{2}{words/doc}'
                lines[3] += r'&    max &    avg '
        if self.report_entities:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{entities}              '
            lines[2] += r'&  total &per 1k &\MC{2}{length}'
            lines[3] += r'&  count & words &   max &  avg '
            if self.report_entity_range:
                lines[0] += "r "
                lines[1] += r'&       '
                lines[2] += r'& range '
                lines[3] += r'& p95   '
            if self.e_len_max:
                for i in range(1, self.e_len_max + 1):
                    lines[0] += "r"
                    lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else "  ")
                    lines[3] += r'&  [\%] '
                lines[0] += " "
                lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}'
        if self.report_mentions:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{mentions}              '
            lines[2] += r'&  total &per 1k &\MC{2}{length}'
            lines[3] += r'&  count & words &   max &  avg '
            if self.m_len_max:
                # Mention lengths start at 0, hence m_len_max + 1 columns.
                for i in range(0, self.m_len_max + 1):
                    lines[0] += "r"
                    lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else "  ")
                    lines[3] += r'&  [\%] '
                lines[0] += " "
                lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7
        if self.report_details:
            # NOTE(review): four "r" specs for three detail columns -- looks
            # like one spec too many; left unchanged, confirm against the
            # rendered table.
            lines[0] += "rrrr "
            lines[1] += r'& \MC{3}{mention type}       '
            lines[2] += r'&w/empty& w/gap&non-tree'
            lines[3] += r'&  [\%] ' * 3
            if self.selected_upos:
                upos_list = self.selected_upos + ['other']
            else:
                # Strip the 'm_head_upos_' prefix (12 characters) from the counter keys.
                upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')]
            lines[0] += "@{~}r" * len(upos_list)
            lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}"
            lines[2] += ''.join(f'&{upos:7}' for upos in upos_list)
            lines[3] += r'&  [\%] ' * len(upos_list)
        lines[0] += r'@{}}\toprule'
        lines[1] += r'\\'
        lines[2] += r'\\'
        lines[3] += r'\\\midrule'

        # Second pass: add the \cmidrule decorations.  last_col tracks the
        # index of the last table column described so far (column 1 is the
        # row label).
        last_col = 1
        if self.report_basics:
            basics_count = (1 if self.report_docs else 0) + 2 + (1 if self.report_empty_nodes else 0)
            lines[1] += r'\cmidrule(lr){2-' + str(1 + basics_count + (2 if self.report_words_per_doc else 0)) + '}'
            lines[2] += r'\cmidrule(lr){2-' + str(1 + basics_count) + '}'
            last_col += basics_count
            if self.report_words_per_doc:
                lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}'
                last_col += 2
        if self.report_entities:
            _cols = 5 if self.report_entity_range else 4
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}'
            lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
            last_col += _cols
            if self.e_len_max:
                # The distribution starts right after the entity summary
                # columns.  The start used to be hard-coded as column 6,
                # which is only right when neither the basic columns nor the
                # entity-range column precede it.
                dist_first_col = last_col + 1
                last_col += self.e_len_max
                lines[1] += r'\cmidrule(lr){' + f"{dist_first_col}-{last_col}" + '}'
        if self.report_mentions:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}'
            lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
            last_col += 4
            if self.m_len_max:
                lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}'
                last_col += self.m_len_max + 1
        if self.report_details:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}"
            lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}'
        print("\n".join(lines))



[docs]
    def print_footer(self, end_doc=True):
        """Close the current LaTeX table; a no-op for styles not starting with 'tex-'.

        Args:
            end_doc: when true and the style is 'tex-doc', the LaTeX
                document itself is closed as well.
        """
        if self.style.startswith('tex-'):
            closing = [r'\bottomrule\end{tabular}', r'\end{mypage}']
            if end_doc and self.style == 'tex-doc':
                closing.append(r'\end{document}')
            print("\n".join(closing))