"""Root class represents the technical root node in each tree."""
import logging
from udapi.core.node import Node, EmptyNode, ListOfNodes
from udapi.core.mwt import MWT
# Pylint's default limit of 7 instance attributes is too low here
# (CoNLL-U alone has 10 columns).
# The set of public attributes/properties and methods of Root was well thought out.
# pylint: disable=too-many-instance-attributes
class Root(Node):
    """Class for representing root nodes (technical roots) in UD trees.

    Besides what it inherits from `Node`, a root keeps sentence-level data
    (`text`, `comment`, `newpar`, `newdoc`, `json`), its zone and bundle,
    the list of all non-root nodes of the tree (`_descendants`),
    the list of multi-word tokens (`_mwts`) and the list of `empty_nodes`.
    """

    __slots__ = ['_sent_id', '_zone', '_bundle', '_descendants', '_mwts',
                 'empty_nodes', 'text', 'comment', 'newpar', 'newdoc', 'json']

    # pylint: disable=too-many-arguments
    def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None):
        """Create new root node.

        Args:
            zone: zone (string label) of the tree, stored without validation
                because a fresh root has no bundle yet.
            comment: stored as `root.comment`.
            text: stored as `root.text` (see `get_sentence`).
            newpar: stored as `root.newpar`.
            newdoc: stored as `root.newdoc`.
        """
        # Call constructor of the parent object.
        super().__init__(root=self)
        self.ord = 0
        self.form = '<ROOT>'
        self.lemma = '<ROOT>'
        self.upos = '<ROOT>'
        self.xpos = '<ROOT>'
        self.deprel = '<ROOT>'
        self.comment = comment
        self.text = text
        self.newpar = newpar
        self.newdoc = newdoc
        self.json = {}  # TODO: or None and mask as {} in property reader&writer to save memory?

        self._sent_id = None
        self._zone = zone
        self._bundle = None
        self._descendants = []
        self._mwts = []
        self.empty_nodes = []  # TODO: private

    @property
    def sent_id(self):
        """ID of this tree, stored in the sent_id comment in CoNLL-U.

        If no ID has been stored yet, it is computed as the bundle address
        plus '/' + zone (the slash is omitted for an empty zone) and cached.
        Without a bundle, '?' is used instead of the bundle address
        and the result is not cached.
        """
        if self._sent_id is not None:
            return self._sent_id
        zone = '/' + self.zone if self.zone else ''
        if self._bundle is None:
            # Not cached, so that a bundle assigned later is reflected in the ID.
            return '?' + zone
        self._sent_id = self._bundle.address() + zone
        return self._sent_id

    @sent_id.setter
    def sent_id(self, sent_id):
        # With a bundle, the part before the first slash becomes the bundle_id
        # and the rest (if any) becomes the zone of this tree.
        if self._bundle is not None:
            parts = sent_id.split('/', 1)
            self._bundle.bundle_id = parts[0]
            if len(parts) == 2:
                self.zone = parts[1]
        # Assigned last: the zone setter above overwrites `_sent_id`.
        self._sent_id = sent_id

    def address(self):
        """Full (document-wide) id of the root.

        The general format of root nodes is:
        root.bundle.bundle_id + '/' + root.zone, e.g. s123/en_udpipe.
        If zone is empty, the slash is excluded as well, e.g. s123.
        If bundle is missing (could occur during loading), '?' is used instead.
        Root's address is stored in CoNLL-U files as sent_id (in a special comment).
        """
        return self.sent_id

    @property
    def document(self):
        """Return the `_document` of the bundle which this tree belongs to.

        Raises:
            AttributeError: if the tree has no bundle (`bundle` is None).
        """
        return self._bundle._document  # pylint: disable=protected-access

    @property
    def bundle(self):
        """Return the bundle which this tree belongs to."""
        return self._bundle

    @bundle.setter
    def bundle(self, bundle):
        self._bundle = bundle

    @property
    def zone(self):
        """Return zone (string label) of this tree."""
        return self._zone

    @zone.setter
    def zone(self, zone):
        """Specify which zone the root belongs to."""
        if self._bundle:
            self._bundle.check_zone(zone)
        self._zone = zone
        # Keep the cached sent_id in sync with the new zone.
        slashzone = '/' + zone if zone else ''
        if self._bundle is not None:
            self._sent_id = self._bundle.address() + slashzone
        elif self._sent_id:
            self._sent_id = self._sent_id.split('/', 1)[0] + slashzone
        else:
            self._sent_id = '?' + slashzone

    @property
    def parent(self):
        """Return dependency parent (head) node.

        This root-specific implementation returns always None.
        """
        return None

    @parent.setter
    def parent(self, _):
        """Attempts at setting parent of root result in AttributeError exception."""
        raise AttributeError('The technical root cannot have a parent.')

    @property
    def descendants(self):
        """Return a list of all descendants of the current node.

        The nodes are sorted by their ord.
        This root-specific implementation returns all the nodes in the tree except the root itself.
        """
        return ListOfNodes(self._descendants, origin=self)

    def is_descendant_of(self, node):
        """Is the current node a descendant of the node given as argument?

        This root-specific implementation returns always False.
        """
        return False

    def is_root(self):
        """Return True for all Root instances."""
        return True

    def remove(self, children=None):
        """Remove the whole tree from its bundle.

        Args:
            children: a string specifying what to do if the root has any children.
                The default (None) is to delete them (and all their descendants)
                silently. With any other value (e.g. `warn`), a warning is issued
                if the root has children; the tree is removed in both cases.
        """
        if children is not None and self.children:
            logging.warning('%s is being removed by remove(children=%s), '
                            ' but it has (unexpected) children', self, children)
        self.bundle.trees = [root for root in self.bundle.trees if root != self]

    def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0):
        """Attempts at changing the word order of root result in Exception."""
        raise Exception('Technical root cannot be shifted as it is always the first node')

    def create_empty_child(self, **kwargs):
        """Create and return a new empty node within this tree.

        This root-specific implementation overrides `Node.create_empty_child()'.
        It is faster because it does not set `deps` and `ord` of the newly created node.
        It is up to the user to set up these attributes correctly.
        It is used in `udapi.block.read.conllu` (where speed is important and thus,
        only `raw_deps` are set up instead of `deps`).
        """
        new_node = EmptyNode(root=self, **kwargs)
        self.empty_nodes.append(new_node)
        return new_node

    # TODO document whether misc is a string or dict or it can be both
    def create_multiword_token(self, words=None, form=None, misc=None):
        """Create and return a new multi-word token (MWT) in this tree.

        The new MWT can be optionally initialized using the following args.

        Args:
            words: a list of nodes which are part of the new MWT
            form: string representing the surface form of the new MWT
            misc: misc attribute of the new MWT

        If the last word has `SpaceAfter=No` in its misc, the flag is moved
        to the new MWT. `SpaceAfter` is then cleared on all the words.
        """
        # Nested or overlapping MWTs are not allowed in CoNLL-U,
        # so first remove all previous MWTs containing any of words.
        # `words` defaults to None, so guard the iteration.
        for word in words or ():
            if word.multiword_token:
                word.multiword_token.remove()

        # Now, create the new MWT.
        # NOTE(review): `words` is passed through unchanged; this assumes MWT
        # accepts None/empty `words` -- confirm against udapi.core.mwt.
        mwt = MWT(words, form, misc, root=self)
        self._mwts.append(mwt)

        # There is no last word to inspect when no words were given.
        if words:
            if words[-1].misc["SpaceAfter"] == "No":
                mwt.misc["SpaceAfter"] = "No"
            for word in words:
                word.misc["SpaceAfter"] = ""
        return mwt

    @property
    def multiword_tokens(self):
        """Return a list of all multi-word tokens in this tree."""
        return self._mwts

    # TODO should this setter be part of the public API?
    @multiword_tokens.setter
    def multiword_tokens(self, mwts):
        """Set the list of all multi-word tokens in this tree."""
        self._mwts = mwts

    def get_sentence(self, if_missing='detokenize'):
        """Return either the stored `root.text` or (if None) `root.compute_text()`.

        Args:
            if_missing: What to do if `root.text` is `None`? (default=detokenize)
                * `detokenize`: use `root.compute_text()` to compute the sentence.
                * `empty`: return an empty string
                * `warn_detokenize`, `warn_empty`: in addition emit a warning
                  via `logging.warning()`
                * `fatal`: raise an exception

        Raises:
            RuntimeError: if `root.text` is None and `if_missing` is `fatal`.
        """
        sentence = self.text
        if sentence is not None:
            return sentence
        if if_missing == 'fatal':
            raise RuntimeError('Tree %s has empty root.text.' % self.address())
        if if_missing.startswith('warn'):
            logging.warning('Tree %s has empty root.text.', self.address())
        if if_missing.endswith('detokenize'):
            return self.compute_text()
        return ''

    @property
    def token_descendants(self):
        """Return all tokens (one-word or multi-word) in the tree.

        ie. return a list of `core.Node` and `core.MWT` instances,
        whose forms create the raw sentence. Skip nodes, which are part of multi-word tokens.

        For example with:
            1-2    vámonos _
            1      vamos   ir
            2      nos     nosotros
            3-4    al      _
            3      a       a
            4      el      el
            5      mar     mar

        `[n.form for n in root.token_descendants]` will return `['vámonos', 'al', 'mar']`.
        """
        result = []
        # ord of the last word of the most recently added MWT;
        # words up to this ord are already covered by that MWT.
        last_mwt_id = 0
        for node in self._descendants:
            mwt = node.multiword_token
            if mwt:
                if node.ord > last_mwt_id:
                    last_mwt_id = mwt.words[-1].ord
                    result.append(mwt)
            else:
                result.append(node)
        return result

    @property
    def descendants_and_empty(self):
        """Return a sorted list of all descendants and all empty nodes of this tree."""
        return sorted(self._descendants + self.empty_nodes)

    def steal_nodes(self, nodes):
        """Move nodes from another tree to this tree (append).

        The stolen nodes get new ords following the current last node of this tree.
        Unless the whole tree is stolen, children left behind are re-attached
        to the old root and stolen nodes whose parent is not stolen
        are attached to this root.
        Multi-word tokens are moved as well (restricted to the stolen words).

        Args:
            nodes: a list of nodes, all from the same tree. An empty list is a no-op.

        Raises:
            ValueError: if the nodes come from several trees.
        """
        if not nodes:
            return
        old_root = nodes[0].root
        for node in nodes[1:]:
            if node.root != old_root:
                raise ValueError("steal_nodes(nodes) was called with nodes from several trees")

        nodes = sorted(nodes)
        whole_tree = nodes == old_root.descendants
        new_ord = len(self._descendants)
        # pylint: disable=protected-access
        for node in nodes:
            new_ord += 1
            node.ord = new_ord
            node._root = self
            if not whole_tree:
                # Children which stay in the old tree are re-attached to its root.
                for child in [n for n in node.children if n not in nodes]:
                    child._parent = old_root
                    old_root._children = sorted(old_root.children + [child])
                node._children = [n for n in node.children if n in nodes]
            if node.parent == old_root or (not whole_tree and node.parent not in nodes):
                node.parent._children = [n for n in node.parent._children if n != node]
                node._parent = self
                self._children.append(node)

        if whole_tree:
            old_root._descendants = []
            self._mwts += old_root.multiword_tokens
            old_root.multiword_tokens = []
        else:
            old_root._descendants = [n for n in old_root._descendants if n not in nodes]
            # The MWT must be looked up for each stolen node separately
            # (not once before the loop), so that every affected MWT
            # is re-created in this tree with the stolen words only.
            for node in nodes:
                mwt = node.multiword_token
                if mwt:
                    words = [w for w in mwt.words if w in nodes]
                    mwt.remove()
                    self.create_multiword_token(words=words, form=mwt.form, misc=mwt.misc)
        self._descendants += nodes
        # pylint: enable=protected-access

    def flatten(self, deprel='root'):
        """Flatten the tree (i.e. attach all nodes to the root) and reset all deprels.

        This is equivalent to
            for node in root.descendants:
                node.parent = root
                node.deprel = deprel
        but it is faster.

        Args:
            deprel: the deprel assigned to all the nodes (default=root)
        """
        # pylint: disable=protected-access
        self._children = self._descendants[:]
        for node in self._children:
            node._parent = self
            node._children.clear()
            node.deprel = deprel
        # pylint: enable=protected-access