Source code for udapi.core.mwt

"""MWT class represents a multi-word token."""
from udapi.core.dualdict import DualDict
from udapi.core.feats import Feats


[docs]
class MWT(object):
    """Class for representing multi-word tokens in UD trees.

    A multi-word token groups several syntactic words (``words``) under a
    single surface ``form``.  Each word passed to the constructor gets a
    back-reference to this object in its ``_mwt`` attribute.

    Attributes:
        words: list of the word (node) objects covered by this token.
        form: surface form of the whole token (or None).
        root: the tree root this token belongs to (or None).
    """
    __slots__ = ['words', 'form', '_feats', '_misc', 'root']

    def __init__(self, words=None, form=None, feats=None, misc=None, root=None):
        """Create a multi-word token and link the given words to it.

        Args:
            words: list of words covered by this token; a new empty list
                is used when None is given.
            form: surface form of the token.
            feats: initial FEATS value; empty values and '_' mean "no feats".
            misc: initial MISC value; empty values and '_' mean "no misc".
            root: the tree root this token belongs to.
        """
        self.words = words if words is not None else []
        self.form = form
        # Feats/DualDict objects are created lazily: None means "not needed yet".
        self._feats = Feats(feats) if feats and feats != '_' else None
        self._misc = DualDict(misc) if misc and misc != '_' else None
        self.root = root
        for word in self.words:
            word._mwt = self  # pylint: disable=W0212

    @property
    def feats(self):
        """Property `feats` in MWT should be used only for `Typo=Yes`.

        See https://universaldependencies.org/changes.html#typos-in-multiword-tokens
        However, Udapi does not enforce this restriction and mwt.feats works exactly the same as node.feats.

        The underlying `Feats` object is created on first access.
        """
        if self._feats is None:
            self._feats = Feats()
        return self._feats

    @feats.setter
    def feats(self, value):
        if self._feats is None:
            self._feats = Feats(value)
        else:
            # Reuse the existing object so that references to it stay valid.
            self._feats.set_mapping(value)

    @property
    def misc(self):
        """Property for MISC attributes stored as a `DualDict` object.

        See `udapi.core.node.Node` for details.
        The underlying `DualDict` object is created on first access.
        """
        if self._misc is None:
            self._misc = DualDict()
        return self._misc

    @misc.setter
    def misc(self, value):
        if self._misc is None:
            self._misc = DualDict(value)
        else:
            # Reuse the existing object so that references to it stay valid.
            self._misc.set_mapping(value)

    @property
    def ord_range(self):
        """Return a string suitable for the first column of CoNLL-U.

        The result has the form ``"<first ord>-<last ord>"``.
        Note that `self.words` is sorted in place as a side effect.

        Raises:
            IndexError: if `self.words` is empty.
        """
        self.words.sort()
        return "%d-%d" % (self.words[0].ord, self.words[-1].ord)

    def remove(self):
        """Delete this multi-word token (but keep its words).

        The back-reference `_mwt` of every word is reset to None and
        this object is removed from `self.root.multiword_tokens`.
        """
        for word in self.words:
            word._mwt = None  # pylint: disable=W0212
        self.root.multiword_tokens.remove(self)

    def address(self):
        """Full (document-wide) id of the multi-word token.

        The result is the address of the root, followed by '#' and `ord_range`.
        """
        # NOTE(review): this expects `root.address` to be a string (attribute
        # or property); if it is a method on the root class, it must be called
        # here instead -- confirm against the root implementation.
        return self.root.address + '#' + self.ord_range

    @staticmethod
    def is_mwt():
        """Is this a multi-word token?

        Returns always True.
        False is returned only by instances of the Node class.
        """
        return True

    @property
    def no_space_after(self):
        """Boolean property as a shortcut for `mwt.misc["SpaceAfter"] == "No"`."""
        return self.misc["SpaceAfter"] == "No"

    @staticmethod
    def is_empty():
        """Is this an Empty node?

        Returns always False because multi-word tokens cannot be empty nodes.
        """
        return False

    @staticmethod
    def is_leaf():
        """Is this a node/mwt without any children?

        Returns always True because multi-word tokens cannot have children.
        """
        return True

    def _get_attr(self, name):  # pylint: disable=too-many-return-statements
        """Return the value of a single attribute or pseudo-attribute.

        Supported names are `form`, `ord` (which gives `ord_range`),
        `feats_split`, `misc_split`, `feats[Key]` and `misc[Key]`.
        The tree-related names `edge`, `children`, `siblings` and `depth`
        give 0 and any other name gives the string ``'<mwt>'``.
        """
        if name == 'form':
            return self.form
        if name == 'ord':
            return self.ord_range
        if name in ('edge', 'children', 'siblings', 'depth'):
            return 0
        if name == 'feats_split':
            return str(self.feats).split('|')
        if name == 'misc_split':
            return str(self.misc).split('|')
        if name.startswith('feats['):
            # Strip the leading "feats[" and the final character (the closing bracket).
            return self.feats[name[6:-1]]
        if name.startswith('misc['):
            # Strip the leading "misc[" and the final character (the closing bracket).
            return self.misc[name[5:-1]]
        return '<mwt>'

    def get_attrs(self, attrs, undefs=None, stringify=True):
        """Return multiple attributes or pseudo-attributes, possibly substituting empty ones.

        MWTs do not have children nor parents nor prev/next nodes,
        so the pseudo-attributes: p_xy, c_xy, l_xy and r_xy are irrelevant (and return nothing).
        Other pseudo-attributes (e.g. dir) return always the string "<mwt>".
        The only relevant pseudo-attributes are
        feats_split and misc_split: a list of name=value formatted strings.
        The `ord` attribute returns actually `mwt.ord_range`.

        Args:
            attrs: A list of attribute names, e.g. ``['form', 'ord', 'feats_split']``.
            undefs: A value to be used instead of None for empty (undefined) values.
            stringify: Apply `str()` on each value (except for None)

        Returns:
            A list of values; `feats_split` and `misc_split` contribute
            one item per name=value pair, other names one item each.
        """
        values = []
        for name in attrs:
            # Names with '_' as the second character (p_xy, c_xy, l_xy, r_xy)
            # refer to related nodes, which an MWT does not have.
            # A slice is used instead of name[1], so that empty and
            # one-character names do not raise IndexError.
            if name[1:2] == '_':
                continue
            value = self._get_attr(name)
            if name in ('feats_split', 'misc_split'):
                values.extend(value)
            else:
                values.append(value)

        if undefs is not None:
            values = [x if x is not None else undefs for x in values]
        if stringify:
            values = [str(x) if x is not None else None for x in values]
        return values

    @property
    def _ord(self):
        """Return `_ord` of the first word; `self.words` is sorted in place.

        Raises:
            IndexError: if `self.words` is empty.
        """
        self.words.sort()
        return self.words[0]._ord


# TODO: node.remove() should check if the node is not part of any MWT
# TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported
# TODO: Make mwt._words private and provide a setter
# TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().)
# TODO: words.setter and node.shift* should check if the MWT does not contain gaps
#       and is still multi-word
# TODO: Make sure mwt.words are always sorted (even after node.shift*).
# TODO: Check if one word is not included in multiple multi-word tokens.