In the GitHub repository you will find a data set of the distribution of citations of scientific papers. Use SMC-ABC to fit a g-and-k distribution to this dataset. Perform all the necessary steps to find asuitable value for “epsilon” and ensuring the model converge and results provides a suitable fit.
this i the code i already wrote :
%matplotlib inline
import arviz as az
import matplotlib.pyplot as plt
import pymc as pm
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from scipy import optimize
#reading data
data = pd.read_csv("sc.csv")
values = data[data.columns[0]].values
#define the G-and-K quantile class
class g_and_k_quantile:
def __init__(self):
self.quantile_normal = stats.norm(0, 1).ppf
def ppf(self, x, a, b, g, k):
z = self.quantile_normal(x)
return a + b * (1 + 0.8 * np.tanh(g * z / 2)) * ((1 + z**2)**k) * z
#step 2: estimate G-and-K parameters
def objective(params, data):
a, b, g, k = params
gk = g_and_k_quantile()
quantiles = np.linspace(0.01, 0.99, len(data)) #avoiding 0 and 1
theoretical = gk.ppf(quantiles, a, b, g, k)
empirical = np.quantile(data, quantiles)
return np.sum((theoretical - empirical) ** 2)
initial_params = [np.median(values), np.std(values), 0.1, 0.1]
result = optimize.minimize(objective, initial_params, args=(values,), method='Nelder-Mead')
a, b, g, k = result.x
#step 3
gk = g_and_k_quantile()
simulated_data = gk.ppf(np.random.uniform(0.01, 0.99, 1000), a, b, g, k)
# Plotting the original data
plt.figure(figsize=(10, 5))
plt.hist(values, bins=50, alpha=0.6, color='blue', label='Actual Data')
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Actual Data')
plt.legend()
plt.show()
# Plotting the G-and-K distribution data
plt.figure(figsize=(10, 5))
plt.hist(simulated_data, bins=50, density=True, alpha=0.6, color='red', label='Simulated G-and-K')
plt.xlabel('Simulated Value')
plt.ylabel('Density')
plt.title('Histogram of Simulated G-and-K Distribution')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
from scipy.stats import norm
# Make sure to import pandas for reading the CSV
data = pd.read_csv("sc.csv")
observed_data = data.iloc[:, 0].values
bsas_co = observed_data # Make sure bsas_co is the numpy array of observed values
class g_and_k_quantile:
def __init__(self):
self.quantile_normal = norm.ppf
def ppf(self, x, a, b, g, k):
z = self.quantile_normal(x)
return a + b * (1 + 0.8 * np.tanh(g * z / 2)) * ((1 + z**2)**k) * z
# Updated to accept rng and size
def gk_simulator(a, b, g, k):
return gk.rvs(len(bsas_co), a, b, g, k)
def octo_summary(data):
quantiles = np.quantile(data, [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875])
sa = quantiles[3] # Median
sb = quantiles[5] - quantiles[1] # IQR
sg = (quantiles[5] + quantiles[1] - 2 * sa) / sb # Scaled difference
sk = (quantiles[6] - quantiles[4] + quantiles[2] - quantiles[0]) / sb # Scaled range
return np.array([sa, sb, sg, sk])
with pm.Model() as model:
# Define parameters
a = pm.Normal('a', mu=0, sigma=10)
b = pm.HalfNormal('b', sigma=5)
g = pm.Normal('g', mu=0, sigma=1)
k = pm.HalfNormal('k', sigma=1)
# Simulator
observed = pm.Simulator('observed', gk_simulator, params=[a, b, g, k],
epsilon=0.1, observed=observed_data)
# Sample using SMC
trace = pm.sample_smc(draws=1000)
az.plot_trace(trace_gk)
plt.show()
first of all the second part of the question "Perform all the necessary steps to find asuitable value for “epsilon” and ensuring the model converge and results provides a suitable fit." is not working, and i dont know if my solution is the right solution for both parts .. i will be happy if someone could help find the errors in my code, or if there is any solution of this book code so let me know
thanks in advance