Source code for mygrad.nnet.activations.softmax

from typing import Optional, Tuple, Union

import numpy as np

from mygrad.math._special import logsumexp as _logsumexp
from mygrad.operation_base import Operation
from mygrad.tensor_base import Tensor
from mygrad.typing import ArrayLike


def _softmax(x, kwargs):
    # ``kwargs`` is a dict of the form ``dict(axis=..., keepdims=True)``; it is
    # forwarded to the reductions so that their results broadcast against ``x``.
    if x.ndim > 0 and x.size > 0:
        # Shift by the max over the normalized axes: this leaves the softmax
        # unchanged but prevents overflow in ``np.exp``.
        x = x - x.max(**kwargs)
        # Integer inputs are promoted to float before exponentiating.
        target = x.astype(float) if issubclass(x.dtype.type, np.integer) else x

        target = np.exp(x, out=target)
        target /= target.sum(**kwargs)
    else:
        # A 0-d input trivially normalizes to 1 (an empty input stays empty).
        target = x.astype(float) if issubclass(x.dtype.type, np.integer) else x
        target = np.ones_like(target)
    return target
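
# A minimal illustration (not part of the library source) of why ``_softmax``
# shifts by the max before exponentiating: subtracting a constant from every
# score leaves exp(x_i) / sum_j(exp(x_j)) unchanged, but keeps ``np.exp`` from
# overflowing.
#
#   >>> import numpy as np
#   >>> x = np.array([1000., 1001., 1002.])
#   >>> np.exp(x) / np.exp(x).sum()              # naive evaluation overflows to nan
#   array([nan, nan, nan])
#   >>> shifted = x - x.max()
#   >>> np.exp(shifted) / np.exp(shifted).sum()  # stable evaluation
#   array([0.09003057, 0.24472847, 0.66524096])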


class Softmax(Operation):
    def __call__(self, a, axis=-1):
        self.variables = (a,)
        x = a.data

        self._kw = dict(axis=axis, keepdims=True)
        # Cache the forward output: the backward pass is expressed entirely
        # in terms of the softmax result itself.
        self._cached_output = _softmax(x, self._kw)
        return self._cached_output

    def backward_var(self, grad, index, **kwargs):
        _ = self.variables[index]  # raises an IndexError for an invalid index
        soft = self._cached_output
        sg = soft * grad
        return sg - soft * np.sum(sg, **self._kw)
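
# A derivation sketch (an editorial note, not library documentation) for why
# ``Softmax.backward_var`` returns ``sg - soft * np.sum(sg, ...)``: writing
# s = softmax(x), the Jacobian entries are ds_i/dx_j = s_i * (delta_ij - s_j),
# so the vector-Jacobian product against an incoming gradient g is
#
#   (J^T g)_j = s_j * g_j - s_j * sum_i(s_i * g_i)
#
# i.e. ``soft * grad - soft * np.sum(soft * grad)``, with the sum taken over
# the same axes that the softmax normalized (hence the reuse of ``self._kw``).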


def softmax(
    x: ArrayLike,
    axis: Union[None, int, Tuple[int, ...]] = -1,
    *,
    constant: Optional[bool] = None,
) -> Tensor:
    r"""
    Applies the softmax activation function::

        f(x) = exp(x) / sum( exp(x) )

    Computes the softmax over one or more axes of an ND-tensor.

    Parameters
    ----------
    x : ArrayLike

    axis : Union[None, int, Tuple[int, ...]], optional (default=-1)
        The axis/axes over which to compute the softmax. By default, the
        softmax is computed over the trailing axis.

    constant : Optional[bool]
        If ``True``, the returned tensor is a constant (it does not
        back-propagate a gradient).

    Returns
    -------
    mygrad.Tensor

    Notes
    -----
    - :math:`N` is the number of samples in the batch.
    - :math:`C` is the number of possible classes for which scores are provided.

    This implements a numerically-stable version of softmax; however,
    log-softmax is still the more numerically stable activation function.

    Given the shape-:math:`(N, C)` tensor of scores, ``x``, the softmax
    classification probabilities are computed. That is, the score for
    class-:math:`k` of a given datum (:math:`s_{k}`) is normalized using
    the 'softmax' transformation:

    .. math::
        p_{k} = \frac{e^{s_k}}{\sum_{i=1}^{C}{e^{s_i}}}

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet import softmax
    >>> x = mg.Tensor([[   2.,   2.,   2.],
    ...                [2E50, 2E50, 1E50]])
    >>> softmax(x)
    Tensor([[0.33333333, 0.33333333, 0.33333333],
            [0.5       , 0.5       , 0.        ]])
    """
    return Tensor._op(Softmax, x, op_kwargs=dict(axis=axis), constant=constant)
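
# A short usage sketch (an editorial example assuming mygrad's standard tensor
# indexing and ``backward()`` behavior; it is not taken from the library docs),
# checking the gradient of a single softmax output against the Jacobian row
# s_k * (one_hot(k) - s):
#
#   >>> import numpy as np
#   >>> import mygrad as mg
#   >>> from mygrad.nnet import softmax
#   >>> x = mg.Tensor([1., 2., 3.])
#   >>> out = softmax(x)
#   >>> out[2].backward()  # gradient of out[2] with respect to x
#   >>> np.allclose(x.grad, out.data[2] * (np.array([0., 0., 1.]) - out.data))
#   True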


class LogSoftmax(Operation):
    scalar_only = True

    def __call__(self, a, axis=-1):
        self.variables = (a,)
        x = a.data

        self._kw = dict(axis=axis, keepdims=True)
        return x - _logsumexp(x, **self._kw)

    def backward_var(self, grad, index, **kwargs):
        a = self.variables[index]
        x = a.data
        soft = _softmax(x, self._kw)
        return grad - soft * np.sum(grad, **self._kw)
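
# Backward derivation sketch for ``LogSoftmax`` (an editorial note, not library
# documentation): with s = softmax(x), d(log s_k)/dx_j = delta_kj - s_j, so the
# vector-Jacobian product against an incoming gradient g is
#
#   (J^T g)_j = g_j - s_j * sum_k(g_k)
#
# which is exactly ``grad - soft * np.sum(grad, ...)`` above, with the sum
# taken over the normalized axes.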


def logsoftmax(
    x: ArrayLike,
    axis: Union[None, int, Tuple[int, ...]] = -1,
    *,
    constant: Optional[bool] = None,
) -> Tensor:
    r"""
    Applies the log-softmax activation function::

        f(x) = log ( exp(x) / sum( exp(x) ) )

    Computes the log-softmax over one or more axes of an ND-tensor.

    Parameters
    ----------
    x : ArrayLike

    axis : Union[None, int, Tuple[int, ...]], optional (default=-1)
        The axis/axes over which to compute the log-softmax. By default, the
        log-softmax is computed over the trailing axis.

    constant : Optional[bool]
        If ``True``, the returned tensor is a constant (it does not
        back-propagate a gradient).

    Returns
    -------
    log_softmax : mygrad.Tensor
        Tensor with same shape as ``x``

    Notes
    -----
    - :math:`N` is the number of samples in the batch.
    - :math:`C` is the number of possible classes for which scores are provided.

    This implements a numerically-stable version of log-softmax, compared
    to the naive implementation using ``mygrad.log``, ``mygrad.exp``, and
    ``mygrad.sum``.

    Given the shape-:math:`(N, C)` tensor of scores, ``x``, the log of the
    softmax classification probabilities is computed. That is, the score for
    class-:math:`k` of a given datum (:math:`s_{k}`) is normalized using
    the 'log-softmax' transformation:

    .. math::
        p_{k} = \log{\frac{e^{s_k}}{\sum_{i=1}^{C}{e^{s_i}}}}

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet import logsoftmax
    >>> x = mg.Tensor([[   2.,   2.,   2.],
    ...                [2E50, 2E50, 1E50]])
    >>> logsoftmax(x)
    Tensor([[-1.09861229e+00, -1.09861229e+00, -1.09861229e+00],
            [ 0.00000000e+00,  0.00000000e+00, -1.00000000e+50]])
    """
    return Tensor._op(LogSoftmax, x, op_kwargs=dict(axis=axis), constant=constant)
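
# A small consistency sketch (an editorial example, not from the library docs):
# for moderate scores ``logsoftmax`` agrees with taking ``np.log`` of
# ``softmax``, while the log-sum-exp formulation above also remains finite for
# the extreme scores shown in the docstring example.
#
#   >>> import numpy as np
#   >>> import mygrad as mg
#   >>> from mygrad.nnet import logsoftmax, softmax
#   >>> x = mg.Tensor([1., 2., 3.])
#   >>> np.allclose(logsoftmax(x).data, np.log(softmax(x).data))
#   True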