Source code for udapi.block.corefud.stats
from udapi.core.block import Block
from collections import Counter
[docs]
class Stats(Block):
"""Block corefud.Stats prints various coreference-related statistics."""
def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entities=True,
             report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM',
             exclude_singletons=False, exclude_nonsingletons=False, style='human',
             per_doc=False, max_rows_per_page=50, **kwargs):
    """Configure the statistics block and zero all accumulators.

    Args:
        m_len_max: mention lengths (in non-empty words) at or above this value
            share one histogram bucket.
        e_len_max: entity lengths (in mentions) at or above this value
            share one histogram bucket.
        report_mentions: include the mention-level columns in the report.
        report_entities: include the entity-level columns in the report.
        report_details: include the mention-type and head-UPOS columns.
        selected_upos: space-separated UPOS tags that get their own head-UPOS
            column (other tags are counted as 'other'); the string 'all'
            disables the selection.
        exclude_singletons: skip entities that have exactly one mention.
        exclude_nonsingletons: skip entities that have more than one mention.
        style: output style, one of 'tex', 'tex-table', 'tex-doc', 'human'.
        per_doc: report (and reset the counters) after every document.
        max_rows_per_page: in per-document mode, the table is closed once more
            than this many rows were printed, and a new one is started.
        **kwargs: forwarded to the base class constructor.

    Raises:
        ValueError: if ``style`` is not one of the supported styles.
    """
    super().__init__(**kwargs)

    # Histogram bucket limits.
    self.m_len_max = m_len_max
    self.e_len_max = e_len_max

    # Which parts of the report are produced.
    self.report_mentions = report_mentions
    self.report_entities = report_entities
    self.report_details = report_details

    # Entity filters.
    self.exclude_singletons = exclude_singletons
    self.exclude_nonsingletons = exclude_nonsingletons

    self.style = style
    if style not in ('tex', 'tex-table', 'tex-doc', 'human'):
        raise ValueError(f'Unknown style {style}')

    # Paging of the per-document output.
    self.per_doc = per_doc
    self.max_rows_per_page = max_rows_per_page
    self._header_printed = False
    # None = nothing printed yet; later an int counting rows on the current page.
    self._lines_printed = None

    # Accumulators.
    self.counter = Counter()
    self.mentions = self.entities = self.singletons = 0
    self.total_nodes = 0
    self.longest_mention = self.longest_entity = 0
    self.m_words = 0

    if selected_upos == 'all':
        self.selected_upos = None
    else:
        self.selected_upos = selected_upos.split()
[docs]
def process_document(self, doc):
    """Add the entities and mentions of ``doc`` to the running statistics.

    Every entity with exactly one mention is counted in ``self.singletons``
    even when it is subsequently filtered out by ``exclude_singletons``.
    Mentions are visited only if mention-level or detail-level reporting
    is enabled.
    """
    self.total_nodes += len(list(doc.nodes))
    self.counter['documents'] += 1
    visit_mentions = self.report_mentions or self.report_details

    for entity in doc.coref_entities:
        size = len(entity.mentions)
        if size == 1:
            self.singletons += 1
            if self.exclude_singletons:
                continue
        elif size > 1 and self.exclude_nonsingletons:
            continue

        if size > self.longest_entity:
            self.longest_entity = size
        self.counter['c_total_len'] += size
        # Entities of e_len_max or more mentions share the last bucket.
        self.counter[f"c_len_{min(size, self.e_len_max)}"] += 1
        self.entities += 1
        if not visit_mentions:
            continue

        for mention in entity.mentions:
            self.mentions += 1
            n_all = len(mention.words)
            n_overt = sum(1 for word in mention.words if not word.is_empty())
            self.m_words += n_all
            if n_overt > self.longest_mention:
                self.longest_mention = n_overt
            self.counter['m_total_len'] += n_overt
            # Mentions of m_len_max or more non-empty words share the last bucket.
            self.counter[f"m_len_{min(n_overt, self.m_len_max)}"] += 1
            if not self.report_details:
                continue

            head_upos = mention.head.upos
            if self.selected_upos and head_upos not in self.selected_upos:
                head_upos = 'other'
            self.counter['m_head_upos_' + head_upos] += 1
            self.counter['m_with_empty'] += int(n_all > n_overt)
            # A comma in the span string marks a discontinuous mention.
            self.counter['m_with_gaps'] += int(',' in mention.span)

            # Count the words whose governor lies outside the mention;
            # more than one such word means the mention is not a treelet.
            inside = set(mention.words)
            outside_governed = 0
            for word in mention.words:
                if word.parent:
                    attached = word.parent in inside
                else:
                    # No basic parent: fall back to the enhanced dependencies.
                    attached = any(dep['parent'] in inside for dep in word.deps)
                if not attached:
                    outside_governed += 1
            self.counter['m_nontreelet'] += int(outside_governed > 1)
[docs]
def after_process_document(self, doc):
    """In per-document mode, report the statistics of ``doc`` and start afresh.

    Does nothing unless ``self.per_doc`` is set. Otherwise the current
    counters are printed via ``process_end`` and then every accumulator is
    reset, so the next document is reported on its own.
    """
    if not self.per_doc:
        return
    self.process_end(skip=False, doc=doc)

    # Reset all accumulators for the next document.
    self.counter = Counter()
    self.mentions = self.entities = self.singletons = 0
    self.total_nodes = 0
    self.longest_mention = self.longest_entity = 0
    self.m_words = 0
[docs]
def process_end(self, skip=True, doc=None):
    """Print one report (one table row in the tex styles) from the counters.

    Args:
        skip: only consulted in per-document mode. When true (the default),
            nothing but the footer is printed; ``after_process_document``
            passes ``skip=False`` to get the row for the current document.
        doc: the document being reported; only used in per-document mode,
            where the ``newdoc`` value of its first tree labels the row.
    """
    # Falsy covers both "nothing printed yet" (None) and "page just closed" (0).
    if not self._lines_printed:
        self.print_header()
        self._lines_printed = 0

    tex_output = self.style.startswith('tex')
    if self.per_doc:
        if skip:
            self.print_footer()
            return
        # An unset newdoc (None) cannot be formatted with a width spec
        # (NoneType.__format__ raises TypeError), so fall back to an empty label.
        newdoc = doc[0].trees[0].newdoc
        if newdoc is None:
            newdoc = ''
        print(f"{newdoc:15}", end='&' if tex_output else '\n')
    elif self.style.startswith('tex-'):
        print(f"{self.counter['documents']:4} documents &")
    self._lines_printed += 1

    # Denominators guarded against division by zero.
    mentions_nonzero = self.mentions or 1
    entities_nonzero = self.entities or 1
    total_nodes_nonzero = self.total_nodes or 1

    # (name, formatted value) pairs in output order.
    columns = []
    if self.report_entities:
        columns += [('entities', f"{self.entities:7,}"),
                    ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"),
                    ('longest_entity', f"{self.longest_entity:6}"),
                    ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")]
        for i in range(1, self.e_len_max + 1):
            percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero
            columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}"))
    if self.report_mentions:
        columns += [('mentions', f"{self.mentions:7,}"),
                    ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"),
                    ('longest_mention', f"{self.longest_mention:6}"),
                    ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")]
        if self.m_len_max:
            # Starts at 0: a mention may consist of empty nodes only.
            for i in range(0, self.m_len_max + 1):
                percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero
                columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}"))
    if self.report_details:
        columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"),
                    ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"),
                    ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}")]
        if self.selected_upos:
            upos_list = self.selected_upos + ['other']
        else:
            # No selection: report every head UPOS seen so far ([12:] strips 'm_head_upos_').
            upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')]
        for upos in upos_list:
            columns.append(('head_upos=' + upos,
                            f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}"))

    if tex_output:
        print(" & ".join(value for _, value in columns), end=" \\\\\n")
    elif self.style == 'human':
        for name, value in columns:
            print(f"{name:>15} = {value.strip():>10}")

    if not self.per_doc:
        self.print_footer()
    elif self._lines_printed > self.max_rows_per_page:
        # Page is full: close the table (but not the document) and
        # let the next call print a fresh header.
        self.print_footer(False)
        self._lines_printed = 0
[docs]
def print_header(self):
    """Print the LaTeX table header; a no-op unless the style starts with 'tex-'.

    For the 'tex-doc' style the document preamble is printed as well, but
    only while ``self._lines_printed`` is still ``None``, i.e. before the
    first table. The header consists of four lines: the tabular column
    specification, the column-group titles, the column names and the units.
    """
    style = self.style
    if not style.startswith('tex-'):
        return

    if style == 'tex-doc' and self._lines_printed is None:
        for preamble_line in (
                r'\documentclass[multi=mypage]{standalone}',
                r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}',
                r'\title{Udapi coreference statistics}',
                r'\begin{document}',
                r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}'):
            print(preamble_line)

    def cmidrule(trim, first, last):
        # Partial horizontal rule spanning columns first..last.
        return r'\cmidrule(' + trim + '){' + f"{first}-{last}" + '}'

    e_max, m_max = self.e_len_max, self.m_len_max
    unit_cell = r'& [\%] '

    # The four header lines, each grown left to right.
    colspec = r'\begin{mypage}\begin{tabular}{@{}l '
    groups = " " * 15
    names = ("document" if self.per_doc else "dataset ") + " " * 7
    units = " " * 15

    if self.report_entities:
        colspec += "rrrr "
        groups += r'& \MC{4}{entities} '
        names += r'& total & per 1k & \MC{2}{length} '
        units += r'& count & words & max & avg. '
        if e_max:
            lengths = range(1, e_max + 1)
            colspec += "r" * len(lengths) + " "
            names += "".join(f"& {n:4}" + ("+ " if n == e_max else " ") for n in lengths)
            units += unit_cell * len(lengths)
            groups += r'& \MC{' + str(e_max) + r'}{distribution of entity lengths}'

    if self.report_mentions:
        colspec += "rrrr "
        groups += r'& \MC{4}{mentions} '
        names += r'& total & per 1k & \MC{2}{length} '
        units += r'& count & words & max & avg. '
        if m_max:
            # Mention lengths start at 0, hence m_max + 1 columns.
            lengths = range(0, m_max + 1)
            colspec += "r" * len(lengths) + " "
            names += "".join(f"& {n:4}" + ("+ " if n == m_max else " ") for n in lengths)
            units += unit_cell * len(lengths)
            groups += r'& \MC{' + str(m_max + 1) + r'}{distribution of mention lengths}' + " " * 7

    if self.report_details:
        # NOTE(review): four 'r' specifiers are added here although only three
        # detail columns are labelled below -- confirm whether that is intended.
        colspec += "rrrr "
        groups += r'& \MC{3}{mention type} '
        names += r'&w/empty& w/gap&non-tree'
        units += unit_cell * 3
        if self.selected_upos:
            upos_list = self.selected_upos + ['other']
        else:
            # No selection: one column per head UPOS seen so far ([12:] strips 'm_head_upos_').
            upos_list = [key[12:] for key in self.counter if key.startswith('m_head_upos_')]
        n_upos = len(upos_list)
        colspec += "@{~}r" * n_upos
        groups += r"& \MC{" + str(n_upos) + r"}{distribution of head UPOS}"
        names += ''.join(f'&{upos:7}' for upos in upos_list)
        units += unit_cell * n_upos

    colspec += r'@{}}\toprule'
    groups += r'\\'
    names += r'\\'
    units += r'\\\midrule'

    # Rules under the group titles; last_col tracks the rightmost column covered so far
    # (column 1 is the row label).
    last_col = 1
    if self.report_entities:
        last_col += 4
        groups += r'\cmidrule(lr){2-5}'
        names += r'\cmidrule(lr){4-5}'
        if e_max:
            last_col += e_max
            groups += cmidrule('lr', 6, last_col)
    if self.report_mentions:
        groups += cmidrule('lr', last_col + 1, last_col + 4)
        names += cmidrule('lr', last_col + 3, last_col + 4)
        last_col += 4
        if m_max:
            groups += cmidrule('lr', last_col + 1, last_col + m_max + 1)
            last_col += m_max + 1
    if self.report_details:
        groups += cmidrule('lr', last_col + 1, last_col + 3)
        groups += cmidrule('l', last_col + 4, last_col + 3 + len(upos_list))

    print("\n".join([colspec, groups, names, units]))