I am trying to reimplement the original GAN paper by Ian Goodfellow et al. And I need to show that my implementation achieves same or similar results as the authors achieved. But I am not sure how to evaluate this metric. I took a look at their implementation but I got some funny results. In the paper they report 225 +- 2 on the MNIST for this metric, while the results I get are bellow -400000000. I thought that maybe the model is bad, but it generates really good images of MNIST digits.
Can someone tell me what am I doing wrong?
Bellow is the code which I used. I copied the part of the code from the official implementation.
Note: valid variable are images taken from the MNIST dataset.
def get_nll(x, parzen, batch_size=10):
"""
Credit: Yann N. Dauphin
"""
inds = range(x.shape[0])
n_batches = int(numpy.ceil(float(len(inds)) / batch_size))
print("N batches:", n_batches)
times = []
nlls = []
for i in range(n_batches):
begin = time.time()
nll = parzen(x[inds[i::n_batches]])
end = time.time()
times.append(end-begin)
nlls.extend(nll)
if i % 10 == 0:
print(i, numpy.mean(times), numpy.mean(nlls))
return numpy.array(nlls)
def log_mean_exp(a):
"""
Credit: Yann N. Dauphin
"""
max_ = a.max(1)
return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1))
def cross_validate_sigma(samples, data, sigmas, batch_size):
lls = []
for sigma in sigmas:
print("Sigma:", sigma)
parzen = theano_parzen(samples, sigma)
tmp = get_nll(data, parzen, batch_size = batch_size)
lls.append(numpy.asarray(tmp).mean())
del parzen
gc.collect()
ind = numpy.argmax(lls)
print(max(lls))
return sigmas[ind]
noise = torch.randn((10000, 100), device=device)
gen_model.eval()
gan_out = gen_model(noise)
sigma_range = numpy.logspace(-1., 0., num=10)
sigma = cross_validate_sigma(gan_out.reshape(10000,-1), valid[0:10000], sigma_range, 100)