import numpy as np
from scipy.special import softmax


def project_onto_simplex(v):
    """
    Project a vector v onto the simplex with sum equal to s.

    Parameters:
    - v: Input vector to be projected

    Returns:
    - Projected vector onto the simplex
    """
    n = len(v)  # Number of elements in the input vector
    u = np.sort(v)[::-1]  # Sort the elements of v in descending order
    cssv = np.cumsum(u) - 1  # Compute the Cumulative Sum of Sorted Values (CSSV)
    ind = np.arange(1, n + 1)  # Indices from 1 to n
    cond = u - cssv / ind > 0  # Check the condition for each element
    rho = ind[cond][-1]  # Find the index where the condition is satisfied
    theta = cssv[cond][-1] / float(rho)  # Compute theta
    w = np.maximum(v - theta, 0)  # Project v onto the simplex
    return w

def closest_non_singular_matrix(A, epsilon=1e-6):
    """
    Obtain the closest non-singular matrix to a given singular matrix by perturbing its singular values.

    Parameters:
        A (numpy.ndarray): Singular matrix.
        epsilon (float): Small positive value for replacing zero singular values.

    Returns:
        numpy.ndarray: Closest non-singular matrix.
    """
    # Compute singular value decomposition
    U, Sigma, Vt = np.linalg.svd(A, hermitian= True)

    # Replace zero singular values with a small positive value
    Sigma[Sigma < epsilon] = epsilon

    # Reconstruct matrix with modified singular values
    A_prime = U @ np.diag(Sigma) @ Vt

    return A_prime

def make_positive_definite_eig(matrix, epsilon=1e-6):
    """
    Ensure the matrix is positive definite by adjusting its eigenvalues.

    Parameters:
    matrix (np.ndarray): The input symmetric matrix.
    epsilon (float): The small value to add to the eigenvalues if they are non-positive.

    Returns:
    np.ndarray: The adjusted positive definite matrix.
    """
    # Check if the matrix is symmetric
    if not np.allclose(matrix, matrix.T):
        raise ValueError("Matrix is not symmetric")

    # Eigenvalue decomposition
    eigenvalues, eigenvectors = np.linalg.eigh(matrix)

    # Adjust the eigenvalues to be positive
    adjusted_eigenvalues = np.maximum(eigenvalues, epsilon)

    # Reconstruct the matrix
    positive_definite_matrix = (eigenvectors @ np.diag(adjusted_eigenvalues) @ eigenvectors.T)

    return positive_definite_matrix

def log_multivariate_gaussian(X, mean, cov):
    """
    Compute the log-pdf of multivariate Gaussian distribution for multiple instances.

    Parameters:
        X (numpy.ndarray): Input matrix where each row represents an instance.
        mean (numpy.ndarray): Mean vector of the Gaussian distribution.
        cov (numpy.ndarray): Covariance matrix of the Gaussian distribution.

    Returns:
        numpy.ndarray: Log-pdf of the multivariate Gaussian distribution for each instance.
    """
    # Compute the log determinant of the covariance matrix
    log_det_cov = np.linalg.slogdet(cov)[1]

    # Compute the Mahalanobis distance for each instance (use no.linalg.pinv, the pseudoinverse)
    mahalanobis = np.sum(np.dot((X - mean), np.linalg.pinv(cov)) * (X - mean), axis=1)

    # Compute the log-pdf for each instance
    log_pdf = -0.5 * (log_det_cov + mahalanobis + len(mean) * np.log(2 * np.pi))
    return log_pdf

class NaiveBayesDisc:
    '''
    Naive Bayes classifiers for categorical input features (NB)
    '''
    def __init__(self, cardY, card, n):
        '''
        Constructor for NB

        Args:
            cardY: cardinality of the class variable
            card: cardinality of the discrete input features
            n: number of features
        '''
        self.cardY = cardY
        self.num_features = n
        self.card= card
        self.class_counts = np.zeros(cardY)
        self.feature_counts = np.zeros((n, card, cardY))
        self.class_probs = np.zeros(cardY)
        self.cond_probs = np.zeros((n, card, cardY))

    def getStats(self, X, Y):
        '''
        The statistics mapping for NB
        Args:
            X: training unlabeled instances
            Y: training class labels

        Returns:
            The statistics obtained from training data for NB: counting statistics
        '''
        m,n= X.shape
        class_counts = np.zeros(self.cardY)
        feature_counts = np.zeros((n, self.card, self.cardY))

        if Y.ndim == 1:
            # Compute class counts
            class_counts += np.bincount(Y, minlength=self.cardY)

            # Compute feature counts
            for j in range(self.num_features):
                feature_counts[j, :, :]= np.bincount(X[:,j]*self.cardY+ Y, minlength= self.card* self.cardY ).reshape((self.card,self.cardY))

        else:
            # Compute class counts
            class_counts += np.sum(Y,axis= 0)

            # Compute input feature counts
            for j in range(self.num_features):
                for x in range(self.card):
                    feature_counts[j, x, :] = np.sum(Y[X[:, j]==x,:], axis= 0)


        return class_counts, feature_counts

    def statsToParams(self, ess= 0):
        '''
        Parameter mapping for NB that gets and stores the parameters from the statistics.

        The statistics are obtained from the attributes: class_counts and feature_counts-

        Args:
            ess: when zero corresponds to maximum likelihood parameters. When positive corresponds to maximum a
            posteriori parameters with a uniform dirichlet prior with ess equivalent sample size
        '''

        if ess>0:
            # Priors for the maximum a posteriori estimates
            feature_counts_prior = np.ones((self.num_features, self.card, self.cardY)) * ess / (self.card * self.cardY)
            class_counts_prior = np.ones(self.cardY) * ess / self.cardY

        # Compute class probabilities
        if ess==0:
            self.class_probs = self.class_counts / np.sum(self.class_counts)
        elif ess>0:
            self.class_probs = (self.class_counts + class_counts_prior) / np.sum(self.class_counts + class_counts_prior)

        # Compute conditional probabilities
        for j in range(self.num_features):
            for f in range(self.card):
                for c in range(self.cardY):
                    if ess== 0:
                        self.cond_probs[j, f, c] = self.feature_counts[j, f, c] / self.class_counts[c]
                    elif ess>0:
                        self.cond_probs[j, f, c] = (self.feature_counts[j, f, c] + feature_counts_prior[j, f, c]) / (
                                    self.class_counts[c] + class_counts_prior[c])

    def fit(self, X, Y, ess= 0):
        '''
        The closed-form learning algorithm for NB.

        When ess=0 learn the maximum likelihood parameters. When ess>0 learn the maximum a posteriori parameters
        with a Dirichlet prior distribution with uniform hyperparameters and an equivalent sample size of ess

        Args:
            X: training unlabeled instances
            Y: training class labels
            ess: equivalent sample size of the Dirichlet prior

        '''
        self.class_counts,self.feature_counts= self.getStats(X, Y)
        self.statsToParams(ess)

    def riskCalib(self, X, Y, lr= 0.1, ess= 0, correct_forbidden_stats= True):
        '''
        Risk-based calibration learning algorithm for NB. It is based on the maximum likelihood/maximum a posteriori
        closed form learning algorithm ess=0/ess>0

        Args:
            X: training unlabeled instances
            Y: training class labels
            lr: learning rate
            ess: equivalent sample size of the Dirichlet prior
            correct_forbidden_stats: when true (default) it does not update the statistics with negative statistics

        '''

        oh = np.eye(self.cardY)[Y]
        pY = self.getClassProbs(X)

        dif = pY - oh

        # compute the change on the statistics
        d_class_counts, d_feature_counts = self.getStats(X, dif)

        self.class_counts -= lr * d_class_counts
        self.feature_counts -= lr * d_feature_counts

        if correct_forbidden_stats:
            if np.any(self.class_counts < 0):
                self.class_counts += lr * d_class_counts

            first, second= np.where((self.feature_counts < 0).any(axis=(1)))
            indices= list(zip(first, second))
            for j,y in indices:
                self.feature_counts[j, :, y] += lr * d_feature_counts[j, :, y]

        # update the parameters from the parameters
        self.statsToParams(ess)


    def gradDesc(self, X, Y, lr= 0.1, opt_normalization= 0):
        '''
        Gradient descent of the average negative log loss in training data for NB. The gradient is computed for the
        parameters (probabilities) in their logarithm form to avoid negative probabilities. Once the gradient descent
        is computed the parameters are projected into the simplex to guarantee that represent probabilities

        :param X: unlabeled instances
        :param Y: labels
        :param lr: learning rate
        :opt_normalization: normalization of the parameters to reflect probability tables.
            O (default): project into the simplex
            1: project using softmax
        :return:
        '''

        m= X.shape[0]

        # one-hot encoding of Y
        oh = np.zeros((m, self.cardY))
        oh[np.arange(m), Y] = 1

        pY= self.getClassProbs(X)
        dif= pY - oh

        d_alpha_y= np.sum(dif, axis= 0)/m
        alpha_y= np.log(self.class_probs)
        alpha_y -= lr * d_alpha_y

        if opt_normalization== 0:
            # Normalize the probability table by projecting into the simplex
            self.class_probs = project_onto_simplex(np.exp(alpha_y))
        else:
            # Normalization using softmax
            self.class_probs= softmax(alpha_y)

        d_beta_y= np.zeros(self.card)
        for j in np.arange(self.num_features):
            for c in np.arange(self.cardY):
                # For each probability table: feature k and class c
                for x in np.arange(self.card):
                    d_beta_y[x]= np.sum(dif[X[:,j]== x,c])/m

                beta_y = np.log(self.cond_probs[j,:,c])
                beta_y -= lr * d_beta_y

                if opt_normalization== 0:
                    # Normalize each probability table by projecting into the simplex
                    self.cond_probs[j, :, c] = project_onto_simplex(np.exp(beta_y))
                else:
                    #Normalization using softmax
                    self.cond_probs[j, :, c]= softmax(beta_y)

    def DFE(self, X, Y, lr= 0.1, ess= 0):
        '''
        Risk descent iterative learning algorithm.

        Args:
            X: Training unlabeled instances
            Y: Training class labels
            lr: learning rate
            correct_negative_stats:
                1) project negative statistics by using the simplex projection of the associated probabilities
                2) do not update the table with negative statistics

        Returns:

        '''

        m= X.shape[0]
        pY = self.getClassProbs(X)
        E= np.zeros((m,self.cardY))
        E[np.arange(m), Y] = 1- pY[np.arange(m), Y]

        # compute the change on the statistics
        d_class_counts, d_feature_counts = self.getStats(X, E)

        self.class_counts += lr * d_class_counts
        self.feature_counts += lr * d_feature_counts

        # update the parameters from the parameters
        self.statsToParams(ess)

    def getClassProbs(self, X):
        m,n= X.shape
        log_pY= np.zeros((m, self.cardY))
        for y in np.arange(self.cardY):
            log_pY[:, y]= np.log(self.class_probs[y])
            for j in np.arange(n):
                log_pY[:, y] += np.log(self.cond_probs[j, X[:,j], y])
        return softmax(log_pY, axis= 1)

    def predict(self, X):
        pY= self.getClassProbs(X)
        return np.argmax(pY, axis= 1)

class QDA:
    '''
    Quadratic discriminant analysis (QDA)
    '''

    def __init__(self, cardY, n):
        self.cardY= cardY
        self.n= n

        self.m_y= None
        self.s1_y= None
        self.s2_y= None
        self.p_y = None
        self.mean_y = None
        self.cov_y = None

    def getStats(self, X, Y):

        m_y = np.zeros(self.cardY)
        s1_y = np.zeros((self.cardY, self.n))
        s2_y = np.zeros((self.cardY, self.n, self.n))

        if Y.ndim == 1:
            # Compute class counts
            for c in range(self.cardY):
                m_y[c] = np.sum(Y == c)

            # Compute cond moments1
            for c in np.arange(self.cardY):
                X_c= X[Y==c, :]
                s1_y[c,:]=np.sum(X_c, axis= 0)
                s2_y[c, :]= X_c.transpose() @ X_c

        else:
            # Compute class counts
            m_y = np.sum(Y, axis=0)

            for c in np.arange(self.cardY):
                s1_y[c, :]= X.transpose() @ Y[:,c]
                s2_y[c, :]= (X.transpose() * Y[:,c]) @ X

        return m_y, s1_y, s2_y

    def statsToParams(self, ess= 0, mean0= 0, w_mean0=0, cov0= 1, w_cov0= 0):
        '''
        Updates the parameters of the QDA using the sufficient statistics in self.m_y, self.s1_y and self.s2_y

        Args:
            ess: equivalent sample size for the Dirichlet prior of the class distribution
            mean0: Prior over the mean.
            w_mean0: Weight for the prior over the mean
            cov0: Prior for the variance.
            w_cov0: Weight for the prior over the covariance matrix
        '''

        #Marginal probabilitiy class distribution
        m_y_prior= np.ones(self.cardY)*ess/self.cardY
        self.p_y= (self.m_y + m_y_prior)/np.sum(self.m_y+ m_y_prior)

        if w_mean0>0:
            # Prior parameters
            prior_mean = np.ones(self.n)* mean0 # Prior mean matrix
            prior_cov = np.eye(self.n)*cov0  # Prior covariance matrix


        self.mean_y= np.zeros((self.cardY,self.n))
        self.cov_y= np.zeros((self.cardY,self.n,self.n))
        if w_mean0== 0:
            for c in np.arange(self.cardY):
                self.mean_y[c,:]= self.s1_y[c,:]/self.m_y[c]
                self.cov_y[c,:,:]= self.s2_y[c,:,:]/self.m_y[c] - np.outer(self.mean_y[c,:],self.mean_y[c,:])
        elif w_mean0>0:
            for c in np.arange(self.cardY):
                mu= self.s1_y[c,:]/self.m_y[c]
                cov= self.s2_y[c,:,:]/self.m_y[c] - np.outer(self.mean_y[c,:],self.mean_y[c,:])

                self.mean_y[c,:]= (mu * self.m_y[c] + w_mean0 * prior_mean)/(self.m_y[c] + w_mean0)
                self.cov_y[c, :, :]= (self.m_y[c]* cov + w_cov0 * prior_cov)/(self.m_y[c] + w_cov0)

    def fit(self,X,Y, ess= 0, mean0= 0, w_mean0=0, cov0= 1, w_cov0= 0):
        self.m_y, self.s1_y, self.s2_y= self.getStats(X,Y)
        self.statsToParams(ess= ess, mean0= mean0, w_mean0=w_mean0, cov0= cov0, w_cov0= cov0)

    def getClassProbs(self, X):

        m= X.shape[0]
        pY= np.zeros((m,self.cardY))

        for c in np.arange(self.cardY):
            log_mvn= log_multivariate_gaussian(X= X, mean=self.mean_y[c,:], cov=self.cov_y[c,:,:])
            # avoid infinity
            log_mvn[np.where(np.isinf(log_mvn))] = -10 ** 12
            pY[:, c] = log_mvn + np.log(self.p_y[c])

        pY= softmax(pY, axis=1)
        return pY

    def riskCalib(self, X, Y, lr= 0.1, ess= 0, mean0= 0, w_mean0=0, cov0= 1, w_cov0= 0, correct_forbidden_stats= False):

        oh = np.eye(self.cardY)[Y]
        pY = self.getClassProbs(X)

        dif = pY - oh

        # compute the change on the statistics
        d_m_y, d_s1_y, d_s2_y = self.getStats(X, dif)
        self.m_y -= lr * d_m_y
        self.s1_y -= lr * d_s1_y
        self.s2_y -= lr * d_s2_y


        if correct_forbidden_stats:
            # If any statistic is not valid undo the changes

            # class marginal
            if np.any(self.m_y < 10**-3):
                self.m_y += lr * d_m_y

            # class conditional covariance
            for y in range(self.cardY):
                if np.any(np.diag(self.s2_y[y]) <= 0):
                    self.s1_y += lr * d_s1_y
                    self.s2_y += lr * d_s2_y

        # update the parameters
        self.statsToParams(ess, mean0, w_mean0, cov0, w_cov0)

    def gradDesc(self, X, Y, lr= 0.1, psd_cov= True):

        m,n= X.shape

        # one-hot encoding of Y
        oh = np.zeros((m, self.cardY))
        oh[np.arange(m), Y] = 1
        pY= self.getClassProbs(X)

        dif= pY - oh
        # avoid class conditional probabilities with NaNs.
        nan_rows= np.isnan(dif).any(axis= 1)
        dif= dif[~nan_rows, :]
        X= X[~nan_rows, :]
        m= dif.shape[0]
        sum= np.sum(dif, axis= 0)


        # Change the parameters to the exponential form
        # log p(y)
        nu_0= np.log(self.p_y)
        #cov^-1 * mu
        nu_1= np.zeros_like(self.mean_y)
        #-1/2 * cov^-1
        nu_2= np.zeros_like(self.cov_y)
        nu_2_inv= np.zeros_like(self.cov_y)
        for c in range(self.cardY):
            inv_cov= np.linalg.pinv(self.cov_y[c])
            nu_1[c]= np.dot(inv_cov, self.mean_y[c])
            nu_2[c]= -0.5 * inv_cov
            nu_2_inv[c]= -2 * self.cov_y[c]


        # Compute the gradients of the parameters in the exponential form
        d_nu_0= sum/m

        #d R(nu)/d nu_1|d=  1/m sum_{x,y} [y==d](x + 1/2 nu_2|d^-1 \nu_1|d
        d_nu_1= np.einsum('ai,aj->ij', dif, X)/m
        for c in np.arange(self.cardY):
            d_nu_1[c]+= 0.5* sum[c] * np.matmul(nu_2_inv[c], nu_1[c])/m


        #d R(nu)/d nu_2|d= 1/m sum_{x,y} (h(d|x) - [d==y])(x·x^t - 1/4 · nu_2|d^-1 · nu_1,d · nu_1|d^t · nu_2|d^-1 + 1/2 tr(nu_2|d^-1)
        d_nu_2= np.zeros((self.cardY, self.n, self.n))
        for c in np.arange(self.cardY):
            d_nu_2[c]= np.einsum('a,ai,aj->ij', dif[:,c], X, X)/m
            d_nu_2[c]+= -0.25 * sum[c] * np.dot(np.dot(nu_2_inv[c], np.outer(nu_1[c],nu_1[c])), nu_2_inv[c])/m
            d_nu_2[c]+= 0.5* sum[c] * np.trace(nu_2_inv[c])/m

        # Apply the gradient
        nu_0 -= lr * d_nu_0
        nu_1 -= lr * d_nu_1
        nu_2 -= lr * d_nu_2

        # Transform the parameters into the standard form and transorm to get valid param values: probs, means and covariances
        # Get probabilities using softmax
        # p_y= exp{nu_0}
        self.p_y = softmax(nu_0)
        epsilon= 10**-3
        for c in range(self.cardY):
            #mu(y)= -1/2 nu_2(y)^-1 nu_1(y)
            self.mean_y[c]= -0.5* np.linalg.inv(nu_2[c]).dot(nu_1[c])

            #cov(y)= -1/2 nu_2(y)^-1
            self.cov_y[c]= -0.5* np.linalg.inv(nu_2[c])

            if psd_cov:
                if self.n< 50:
                    # Get the closes covarianze matrix using eigenvalue decomposition
                    self.cov_y[c] = closest_non_singular_matrix(self.cov_y[c], epsilon= epsilon)
                else:
                    np.fill_diagonal(self.cov_y[c], np.maximum(np.diagonal(self.cov_y[c]), epsilon))

    def predict(self, X):
        pY= self.getClassProbs(X)
        return np.argmax(pY, axis= 1)


class LogReg:
    '''
    Logistic regression classifier
    '''
    def __init__(self,n,cardY,canonical= False):
        '''
        Constructor for the multiclass logistic regression classifier
        -All the variables are considered continuous and the feature vector corresponds to the identity, i.e.,phi(x)=(1,x)

        :param n: the number of predictor variables
        :param cardY: the number of class labels
        :param canonical: the model is given in the exponential canonical form. It includes the log partition function
        implemented in the procedure A. When canonical= False
        '''
        self.n= n
        self.cardY= cardY
        self.canonical= canonical

    def fit(self, X, Y, ess= 0, mean0= 0, w_mean0=0, cov0= 1, w_cov0= 0):
        '''
        Closed-form learning algorithm for LR

        Args:
            X: training unlabeled instances
            Y: training class labels
            ess:
            mean0:
            w_mean0:
            cov0:
            w_cov0:

        Returns:
        '''
        self.m, self.my, self.sy, self.s2y= self.getStats(X,Y)
        self.statsToParams(self.m, self.my, self.sy, self.s2y, ess, mean0, w_mean0, cov0, w_cov0)

    def exponent(self, X):
        '''
        Compute the exponents that determine the class probabilities
        h(y|x) propto np.exp(self.exponent(x)[y])

        :param X: unlabeled instances
        :return: the logarithm of the class probabilities for each unlabeled instance, log h(y|x)
        '''
        if self.canonical:
            return self.alpha_y + X.dot(self.beta_y.transpose()) - self.A()
        else:
            return self.alpha_y + X.dot(self.beta_y.transpose())

    def A(self):
        '''
        The log partition function for the logistic regression in the canonical form of the exponential
        family (self.canonical=True) under the equivalence with Gaussian naive Bayes with homocedasticity assumption

        Note that the terms independent of the class label are canceled in the softmax

        -See https://en.wikipedia.org/wiki/Exponential_family#Table_of_distributions normal distribution
        -The parameters self.kappa (one for ech predictor) correspond to -1/sigma^2.
        :return:
        '''
        A = -np.sum(self.beta_y ** 2 / (2 * self.kappa), axis=1)  # - np.sum(1/2 * np.log(-2 * self.kappa))
        return A

    def getClassProbs(self,X):
        '''
        Get the predicted conditional class distribution for a set of unlabeled instances

        X: unlabeled instances
        '''

        if X.ndim>1:
            return softmax(self.exponent(X), axis=1)
        else:
            return softmax(self.exponent(X))

    def predict(self,X):
        '''
        return the class labels with maximum probability for the given unlabeled instances

        :param X: unlabeled instances
        :return: an array with the labels with maximum probability
        '''
        return np.argmax(self.getClassProbs(X), axis=1)

    def initialization(self):
        '''
        Standard initialization for the logistic regression: all the parameters are set to zero

        Warning: this is incompatible with the canonical form, self.canonical = True
        '''
        self.beta_y= np.zeros((self.cardY,self.n))
        self.alpha_y= np.zeros(self.cardY)

    def statsToParams(self, m, my, sy, s2y, ess= 0, mean0= 0, w_mean0=0, cov0= 1, w_cov0= 0):
        '''
        Parameter mapping for the LR given the input statistics stored at attributes. Depending on the values of
        ess, w_mean0, w_cov0 maximum likelihood (values = 0) or maximum a posteriori (values > 0) parameters are
        computed

        Args:
            m: number of instances
            my: number of instances of class y
            sy: sum of x for the instances of class y
            s2y: sum of x^2 for the instances of class y
            ess: equivalent sample size of the Dirichlet uniform prior for the class marginal distribucion
            mean0: the prior mean vector conditioned to each class label
            w_mean0: equivalent sample size for prior mean vector conditioned to each class label
            cov0: the prior variance vector
            w_cov0: equivalent sample size for prior variance vector

        Returns:

        '''

        # Parameters for Gaussian naive Bayes under homocedasticity
        if ess== 0:
            self.py = my / m
        else:
            m_y_prior = np.ones(self.cardY) * ess / self.cardY
            self.py= (my+ m_y_prior)/(m + ess)

        if w_mean0==0:
            self.mu_y= sy/np.repeat(my, self.n).reshape(sy.shape)
        elif w_mean0>0:
            prior_mean= np.ones((self.cardY, self.n))*mean0
            self.mu_y = (sy +prior_mean* w_mean0) /(np.repeat(my, self.n).reshape(sy.shape)+ w_mean0)

        if w_cov0==0:
            # s2/m - sum_y m_y/m * mu_y^2
            self.v=  np.sum(s2y,axis=0)/m - self.py.transpose().dot(self.mu_y**2)
        elif w_cov0>0:
            var= (np.sum(s2y,axis= 0) - np.sum(sy**2/np.tile(my[:,np.newaxis],(1,self.n)), axis= 0))/m
            prior_cov= np.ones(self.n)* cov0
            self.v= (var * m + prior_cov* w_cov0)/(m + w_cov0)

        self.standardToNatural()

    def standardToNatural(self):
        '''
        Transform the parameters to the exponential form of the model

        Depending on the form of the logistic regression (self.canonical) the log partition function (false) is included
        into the parameters associated to the class marginal distribution.
        '''
        self.beta_y= self.mu_y/self.v
        if self.canonical:
            self.alpha_y = np.log(self.py)
            # kappa is used to compute the log partition function, A
            self.kappa= -1/self.v
        else:
            # When logistic regression is not in the canonical form the log partition function is included
            # in the parameters associated to the marginal of the class distribution, log(p(Y)). The terms
            # independent from y in the log partition function are canceled
            self.alpha_y = np.log(self.py) - np.sum(self.mu_y**2/(2 * self.v),axis=1)

    def getStats(self,X,Y):
        '''
        Statistics mapping of the training data for LR

        Args:
            X: training unlabeled instances
            Y: training class labels

        return: number of instances, number of instances for each class, sum of x for each input feature conditioned to
        each class label, sum of x^2 for each input feature conditioned to each class label class
        '''

        if Y.ndim== 1:
            # Maximum likelihood statistics
            m= float(Y.shape[0])
            my= np.unique(Y, return_counts=True)[1].astype(float)
            sy= np.row_stack([np.sum(X[Y==y,:],axis= 0) for y in np.arange(self.cardY)])
            s2y= np.row_stack([np.sum(X[Y==y,:]**2,axis= 0) for y in np.arange(self.cardY)])
        elif Y.ndim== 2:
            # Weighted maximum likelihood statistics
            pY= Y
            m= np.sum(pY)
            my= np.sum(pY, axis= 0)
            sy= X.transpose().dot(pY).transpose()
            s2y= (X**2).transpose().dot(pY).transpose()

        return (m,my,sy,s2y)

    def gradDesc(self, X, Y, lr= 0.1):
        '''
        Gradient descent of the average negative log loss in training data for LR

        Args:
            X: training unlabeled instances
            Y: training class labels
            lr: learning rate (default 0.1)
        :return:
        '''

        m= X.shape[0]

        # one-hot encoding of Y
        oh = np.zeros((m, self.cardY))
        oh[np.arange(m), Y] = 1

        pY= self.getClassProbs(X)
        dif= pY - oh
        if self.canonical:
            d_alpha = np.average(dif, axis=0)
            d_beta = dif.transpose().dot(X)/m + np.tile(np.average(dif,axis= 0)[:,np.newaxis],(1,self.n)) *self.beta_y/(2*self.kappa)
            d_kappa = np.sum(np.tile(np.average(dif,axis= 0)[:,np.newaxis],(1,self.n)) *(-self.beta_y**2),axis= 0)/(4*self.kappa**2)
        else:
            d_alpha = np.average(dif, axis=0)
            d_beta = dif.transpose().dot(X) / m

        self.beta_y -= lr * d_beta
        self.alpha_y -= lr * d_alpha
        if self.canonical:
            self.kappa -= lr * d_kappa


    def riskCalib(self, X, Y, lr= 0.1, ess= 0, mean0= 0, w_mean0=0, cov0= 1, w_cov0= 0, correct_forbidden_stats= True):
        '''
        Risk-based calibration for LR. The closed from algorithm learn maximum likelihood (ess=0, w_mean0= 0,
        w_cov0= 0) or the maximum a posteriori parameters (values grater than 0)

        args:
            X: training unlabeled instances
            Y: training class labels
            lr: learning rate
            ess: equivalent sample size of the Dirichlet uniform prior for the class marginal distribucion
            mean0: the prior mean vector for the normal-Whisart prior
            w_mean0: equivalent sample size for the Normal-Whisart prior of the mean vector conditioned to ech class
            label
            cov0: the variance prior for the normal-Whisart prior
            w_cov0: equivalent sample size for the Normal-Whisart prior of the variance
            correct_forbidden_stats: when true (default) it does not update the statistics with negative statistics
        '''

        m= X.shape[0]

        # one-hot encoding of Y
        oh= np.eye(self.cardY)[Y]
        pY = self.getClassProbs(X)
        dif = pY - oh

        # compute the change on the statistics
        d_m, d_my, d_sy, d_s2y = self.getStats(X,dif)
        self.m -= lr * d_m
        self.my -= lr * d_my
        self.sy -= lr * d_sy
        self.s2y -= lr * d_s2y

        if correct_forbidden_stats:
            if np.any(self.my <= 0):
                self.my += lr * d_my

        self.statsToParams(self.m, self.my, self.sy, self.s2y, ess, mean0, w_mean0, cov0, w_cov0)