Source code for udapi.core.mwt

"""MWT class represents a multi-word token."""
from udapi.core.dualdict import DualDict
from udapi.core.feats import Feats

class MWT(object):
    """Class for representing multi-word tokens in UD trees.

    Instance attributes (fixed by ``__slots__``):
        words: list of the word nodes covered by this multi-word token.
        form: surface form of the whole token.
        root: the tree root this token belongs to; it is used by
            `remove()` and `address()`.
        _feats, _misc: lazily created storage behind the `feats` and `misc`
            properties (``None`` until first needed).
    """
    __slots__ = ['words', 'form', '_feats', '_misc', 'root']

    def __init__(self, words=None, form=None, feats=None, misc=None, root=None):
        """Create a multi-word token and register it on each of its words.

        Args:
            words: list of word nodes; a new empty list is used when None.
            form: surface form of the token.
            feats: initial FEATS value; a false value or ``'_'`` means "no feats".
            misc: initial MISC value; a false value or ``'_'`` means "no misc".
            root: the root of the tree this token belongs to.
        """
        self.words = words if words is not None else []
        self.form = form
        # Feats/DualDict objects are allocated only when there is real content.
        self._feats = Feats(feats) if feats and feats != '_' else None
        self._misc = DualDict(misc) if misc and misc != '_' else None
        self.root = root
        # Back-reference, so that each word knows the token it belongs to.
        for word in self.words:
            word._mwt = self  # pylint: disable=W0212

    @property
    def feats(self):
        """Property `feats` in MWT should be used only for `Typo=Yes`.

        See https://universaldependencies.org/changes.html#typos-in-multiword-tokens
        However, Udapi does not enforce this restriction and mwt.feats works exactly the same as node.feats.
        """
        # Created on first access, so tokens without feats stay cheap.
        if self._feats is None:
            self._feats = Feats()
        return self._feats

    @feats.setter
    def feats(self, value):
        if self._feats is None:
            self._feats = Feats(value)
        else:
            # Update the existing object in place instead of replacing it.
            self._feats.set_mapping(value)

    @property
    def misc(self):
        """Property for MISC attributes stored as a `DualDict` object.

        See `udapi.core.node.Node` for details.
        """
        # Created on first access, so tokens without misc stay cheap.
        if self._misc is None:
            self._misc = DualDict()
        return self._misc

    @misc.setter
    def misc(self, value):
        if self._misc is None:
            self._misc = DualDict(value)
        else:
            # Update the existing object in place instead of replacing it.
            self._misc.set_mapping(value)

    @property
    def ord_range(self):
        """Return a string suitable for the first column of CoNLL-U.

        The result has the form ``"<first ord>-<last ord>"``.
        Side effect: `self.words` is sorted in place.
        Raises IndexError if `self.words` is empty.
        """
        self.words.sort()
        return "%d-%d" % (self.words[0].ord, self.words[-1].ord)

    def remove(self):
        """Delete this multi-word token (but keep its words)."""
        for word in self.words:
            word._mwt = None  # pylint: disable=W0212
        self.root.multiword_tokens.remove(self)

    def address(self):
        """Full (document-wide) id of the multi-word token."""
        # NOTE(review): this concatenation assumes `self.root.address` is a string
        # (attribute or property), not a method -- confirm against the root class.
        return self.root.address + '#' + self.ord_range

    @staticmethod
    def is_mwt():
        """Is this a multi-word token?

        Returns always True.
        False is returned only by instances of the Node class.
        """
        return True

    @property
    def no_space_after(self):
        """Boolean property as a shortcut for `mwt.misc["SpaceAfter"] == "No"`."""
        return self.misc["SpaceAfter"] == "No"

    @staticmethod
    def is_empty():
        """Is this an Empty node?

        Returns always False because multi-word tokens cannot be empty nodes.
        """
        return False

    @staticmethod
    def is_leaf():
        """Is this a node/mwt without any children?

        Returns always True because multi-word tokens cannot have children.
        """
        return True

    def _get_attr(self, name):  # pylint: disable=too-many-return-statements
        """Return the value of a single attribute or pseudo-attribute.

        Names that are not handled explicitly below yield the string '<mwt>'.
        """
        if name == 'form':
            return self.form
        if name == 'ord':
            return self.ord_range
        if name in ('edge', 'children', 'siblings', 'depth'):
            return 0
        if name == 'feats_split':
            return str(self.feats).split('|')
        if name == 'misc_split':
            return str(self.misc).split('|')
        if name.startswith('feats['):
            # Strip the leading "feats[" and the trailing "]".
            return self.feats[name[6:-1]]
        if name.startswith('misc['):
            # Strip the leading "misc[" and the trailing "]".
            return self.misc[name[5:-1]]
        return '<mwt>'

    def get_attrs(self, attrs, undefs=None, stringify=True):
        """Return multiple attributes or pseudo-attributes, possibly substituting empty ones.

        MWTs do not have children nor parents nor prev/next nodes,
        so the pseudo-attributes: p_xy, c_xy, l_xy and r_xy are irrelevant (and return nothing).
        Other pseudo-attributes (e.g. dir) return always the string "<mwt>".
        The only relevant pseudo-attributes are
        feats_split and misc_split: a list of name=value formatted strings.
        The `ord` attribute returns actually `mwt.ord_range`.

        Args:
        attrs: A list of attribute names, e.g. ``['form', 'ord', 'feats_split']``.
        undefs: A value to be used instead of None for empty (undefined) values.
        stringify: Apply `str()` on each value (except for None)

        Returns:
        A list of values; each ``*_split`` attribute contributes one item per
        name=value pair and each prefixed (``x_...``) attribute contributes nothing.
        """
        values = []
        for name in attrs:
            # Any "<char>_" prefix (p_, c_, l_, r_) refers to related nodes, which
            # an MWT does not have, so such attributes contribute nothing.
            # Slicing (instead of indexing name[1]) keeps names shorter than
            # two characters from raising IndexError.
            if name[1:2] == '_':
                continue
            if name in {'feats_split', 'misc_split'}:
                values.extend(self._get_attr(name))
            else:
                values.append(self._get_attr(name))

        if undefs is not None:
            values = [x if x is not None else undefs for x in values]
        if stringify:
            values = [str(x) if x is not None else None for x in values]
        return values

    @property
    def _ord(self):
        """Return the `_ord` of the first word; sorts `self.words` in place.

        Raises IndexError if `self.words` is empty.
        """
        self.words.sort()
        return self.words[0]._ord
# TODO: node.remove() should check if the node is not part of any MWT
# TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported
# TODO: Make mwt._words private and provide a setter
# TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().)
# TODO: words.setter and node.shift* should check if the MWT does not contain gaps
#       and is still multi-word
# TODO: Make sure mwt.words are always sorted (even after node.shift*).
# TODO: Check if one word is not included in multiple multi-word tokens.