MLE for a Polya Distribution

533 Views Asked by At

I'm working on programming a MLE for the Polya distribution using scipy. The Nelder-Mead method is working, however I get a "Desired error not necessarily achieved due to precision loss." error when running BFGS. The Nelder-Mead method seems like it's too slow for my needs (I have a lot of fairly big data, say 1000 tables in some cases as big as 10x10000). I've tried using the check_grad function and the result is smallish on the example below (order 10^-2), so I'm not sure if that means there's a bug in the gradient of the log likelihood or the likelihood is just very strongly peaked. For what it's worth, I've stared quite hard at my code and I can't see the issue. Here's some example code to recreate the problem

#setup some data
from numpy.random import dirichlet, multinomial
from scipy.optimize import check_grad

alpha = [10,30,50]
p = pd.DataFrame(dirichlet(alpha,200))
data = p.apply(lambda x: multinomial(500,x),1)
a = np.array(data.mean(0))

#optimize
result = minimize(lambda a: -1*llike(data,exp(a)),
    x0=np.log(a),
    method='Nelder-Mead')
x0=result.x
result = minimize(lambda a: -1*llike(data,exp(a)),
    x0=x0,
    jac=lambda a: -1*gradient_llike(data,np.exp(a)),
    method='BFGS')
exp(result.x) #should be close to alpha
#uhoh, let's check that this is right.
check_grad(func=lambda a: -1*llike(data,a),grad=lambda a: -1*gradient_llike(data,a),x0=alpha)

Here's the code for my functions

def log_polya(Z,alpha):
    """
    Z is a vector of counts
    https://en.wikipedia.org/wiki/Dirichlet-multinomial_distribution
    http://mimno.infosci.cornell.edu/info6150/exercises/polya.pdf
    """
    if not isinstance(alpha,np.ndarray):
        alpha = np.array(alpha)
    if not isinstance(Z,np.ndarray):
        Z = np.array(Z)
    #Concentration Parameter
    A = sum(alpha)
    #Number of Datapoints
    N = sum(Z)
    return gammaln(A) - gammaln(N+A) + sum(gammaln(Z+alpha) - gammaln(alpha))

def llike(data,alpha):
    return sum(data.apply(log_polya,1,alpha=alpha))

def log_polya_derivative(Z,alpha):
    if not isinstance(alpha,np.ndarray):
        alpha = np.array(alpha)
    if not isinstance(Z,np.ndarray):
        Z = np.array(Z)
    if 0. in Z+alpha:
        Warning("invalid prior parameter,nans should be produced")
    #Concentration Parameter
    A = sum(alpha)
    #Number of Datapoints
    N = sum(Z)
    K = len(Z)
    return np.array([psi(A) - psi(N+A) + psi(Z[i]+alpha[i]) - psi(alpha[i]) for i in xrange(K)])


def gradient_llike(data,alpha):
     return np.array(data.apply(log_polya_derivative,1,alpha=alpha).sum(0))

UPDATE: Still curious about this, but for those interested in a working implementation for this problem, the following code for implementing the Minka Fixed Point Algorithm seems to work well (i.e. recovers quickly values that are close to the true dirichlet parameter).

def minka_mle_polya(data):
    """
    http://research.microsoft.com/en-us/um/people/minka/papers/dirichlet/minka-dirichlet.pdf
    """
    data = np.array(data)
    K = np.shape(data)[1]
    alpha = np.array(data.mean(0))
    alpha_new = np.ndarray((K))
    precision = 10
    while precision > 10**-5:
        for k in range(K):
            A = sum(alpha)
            N = data.sum(1)
            numerator = sum(
                    psi(data[:,k]+alpha[k])-psi(alpha[k])
                    )
            denominator = sum(
                psi(N+A)-psi(A)
                )
            alpha_new[k] = alpha[k]*numerator/denominator
        precision = sum(abs(alpha_new - alpha))
        alpha_old = np.array(alpha)
        alpha = np.array(alpha_new)
        print "Gap", precision
0

There are 0 best solutions below