"""Block ud.SplitUnderscoreTokens splits tokens with underscores are attaches them using flat.
Usage:
udapy -s ud.SplitUnderscoreTokens < in.conllu > fixed.conllu
Author: Martin Popel
"""
import logging
from udapi.core.block import Block
class SplitUnderscoreTokens(Block):
    """Block for splitting tokens with underscores and attaching the new nodes using deprel=flat.

    E.g.::

      1 Hillary_Rodham_Clinton Hillary_Rodham_Clinton PROPN xpos 0 dep

    is transformed into::

      1 Hillary Hillary PROPN xpos 0 dep
      2 Rodham Rodham PROPN xpos 1 flat
      3 Clinton Clinton PROPN xpos 1 flat

    Real-world use cases: UD_Irish (`default_deprel=fixed`) and UD_Czech-CLTT v1.4.
    """

    def __init__(self, deprel=None, default_deprel='flat', **kwargs):
        """Create the SplitUnderscoreTokens block instance.

        Args:
        deprel: Which deprel to always use for the newly created nodes?
            Most common values are: flat, fixed, compound. Default=None.
        default_deprel: Which deprel to use for the newly created nodes if the heuristics
            in `deprel_for()` method fail. Default=flat.
        """
        super().__init__(**kwargs)
        self.deprel = deprel
        self.default_deprel = default_deprel

    def process_node(self, node):
        """Split `node` on underscores into several nodes, if its form contains any.

        The original node keeps the first part of the form and lemma. For every
        further part a child of the original node is created with that part as
        its form and lemma, with upos and xpos copied from the original node and
        with the deprel returned by `deprel_for()`. The new nodes are placed
        right after the original node, preserving the order of the parts.

        The token is left untouched (and a warning is logged) when the form and
        the lemma contain a different number of underscores, or when splitting
        the form would produce an empty part.

        Args:
        node: the node to be processed; it is modified in place.
        """
        # A form consisting of a single underscore is never split.
        if node.form == '_' or '_' not in node.form:
            return

        forms = node.form.split('_')
        lemmas = node.lemma.split('_')
        if len(forms) != len(lemmas):
            logging.warning("Different number of underscores in %s and %s, skipping.",
                            node.form, node.lemma)
            return

        # str.split() yields empty strings for leading, trailing or repeated
        # underscores (e.g. "_foo", "a__b"). Splitting such a token would create
        # nodes with an empty form, so it is safer to leave the token as it is.
        if not all(forms):
            logging.warning("Leading, trailing or repeated underscore in %s, skipping.",
                            node.form)
            return

        # Computed before node.form/node.lemma are overwritten below.
        deprel = self.deprel_for(node)

        # Each new node is shifted after the previously created one,
        # so the parts end up in their original left-to-right order.
        last_node = node
        for form, lemma in zip(forms[1:], lemmas[1:]):
            new_node = node.create_child(form=form, lemma=lemma, upos=node.upos,
                                         xpos=node.xpos, deprel=deprel)
            new_node.shift_after_node(last_node)
            last_node = new_node

        node.form = forms[0]
        node.lemma = lemmas[0]

        # SpaceAfter=No described the end of the whole original token,
        # so it now belongs to the last of the newly created nodes.
        if node.misc['SpaceAfter'] == 'No':
            del node.misc['SpaceAfter']
            last_node.misc['SpaceAfter'] = 'No'

    def deprel_for(self, node):
        """Return deprel of the newly created nodes: `flat`, `fixed`, `compound` or its subtypes.

        See http://universaldependencies.org/u/dep/flat.html
        http://universaldependencies.org/u/dep/fixed.html
        http://universaldependencies.org/u/dep/compound.html
        Note that unlike the first two, `deprel=compound` does not need to be head-initial.

        This method implements coarse heuristic rules to decide between `fixed` and `flat`.
        If the block was created with a non-empty `deprel` parameter, that value
        is returned and the heuristics are not applied.

        Args:
        node: the node being split; only its `upos` is inspected.

        Returns:
        The deprel to be used for the nodes created from `node`.
        """
        if self.deprel:
            return self.deprel

        # Proper names tend to form `flat` constructions.
        if node.upos == 'PROPN':
            return 'flat'

        # Closed-class words (except NUM) tend to form `fixed` constructions.
        if node.upos in ('ADP', 'AUX', 'CCONJ', 'DET', 'PART', 'PRON', 'SCONJ'):
            return 'fixed'

        # The default default :-) is `flat`.
        return self.default_deprel