I have two separate distributions given by the functions P_multiple
and P_single
. I would like to combine the two and into one combined fit function which has a natural "transition" between the two. Do you have suggestions? Thank you!
Here is the plot I have versus what I'd like to get (in red).
import numpy as np
import matplotlib.pyplot as plt
def P_multiple(alpha):
return 1 / np.sqrt(np.pi) * np.exp(-alpha**2)
def P_single(alpha, Z):
alpha_abs = np.abs(alpha)
return 1 / (alpha_abs**3 * 8 * np.log(204 * Z**(-1/3)))
Z_silicon = 14
alpha_threshold = 3
alpha_values = np.linspace(-7, 7, 300)
multiple = P_multiple(alpha_values)
single = P_single(alpha_values, Z_silicon)
multiple[np.abs(alpha_values) > alpha_threshold] = 0
single[np.abs(alpha_values) <= alpha_threshold] = 0
combined_distribution = multiple + single
plt.figure(figsize=(10, 6))
plt.plot(alpha_values, multiple, label='Multiple')
plt.plot(alpha_values, single, label='Single')
plt.plot(alpha_values, combined_distribution, label='Combined', linestyle='--')
plt.legend()
plt.yscale('log')
plt.show()
Later edit:
I have done a bit of digging for some simplified formula that might help in this respect. I found an answer in here.
Following this, I've tried to generate some data and then fit it using scipy's curve_fit
and minuit
minimization. curve_fit
seems to work to some extent, but the fit is not perfect. Minuit works to some extent, but the initial fit is translated on the y axis. I managed to use a scaling factor to make the fit visually ok, but the chi2 per degree of freedom is very bad. The scaling might not be what I need.
Any suggestions? The code I paste below:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from scipy.integrate import cumtrapz
from scipy.optimize import curve_fit
from scipy.special import gamma
from iminuit import Minuit
def P_multiple(alpha):
return 1 / np.sqrt(np.pi) * np.exp(-alpha**2)
def P_single(alpha, Z):
alpha_abs = np.abs(alpha)
return 1 / (alpha_abs**3 * 8 * np.log(204 * Z**(-1/3)))
# The fit function (sum of a Gaussian and a Student's t-distribution)
def fit_function(theta, N, a, mu, sigma_g, sigma, tail_parameter):
gaussian_part = (1 - a) * np.exp(-((theta - mu) ** 2) / (2 * sigma_g ** 2)) / (sigma_g * np.sqrt(2 * np.pi))
t_part = (a * gamma((tail_parameter + 1) / 2) / (np.sqrt(tail_parameter * np.pi) * sigma * gamma(tail_parameter / 2)) *
np.power(1 + ((theta - mu) ** 2) / (tail_parameter * sigma ** 2), -(tail_parameter + 1) / 2))
return N * (gaussian_part + t_part)
def binned_likelihood(params, theta, counts, bin_width):
N, a, mu, sigma_g, sigma, tail_parameter = params
expected_counts = fit_function(theta, N, a, mu, sigma_g, sigma, tail_parameter) * bin_width
# Using Poisson likelihood
return -2 * np.sum(counts * np.log(expected_counts) - expected_counts)
# Cost function for Minuit
def cost_function(N, a, mu, sigma_g, sigma, tail_parameter):
params = [N, a, mu, sigma_g, sigma, tail_parameter]
expected_counts = fit_function(bin_centers, *params) * bin_width
expected_counts = np.maximum(expected_counts, 1e-6) # Ensure non-zero expected counts
cost = -2 * np.sum(counts * np.log(expected_counts) - expected_counts)
return cost
Z_silicon = 14
alpha_threshold = 3
alpha_values = np.linspace(-7, 7, 300)
multiple = P_multiple(alpha_values)
single = P_single(alpha_values, Z_silicon)
multiple[np.abs(alpha_values) > alpha_threshold] = 0
single[np.abs(alpha_values) <= alpha_threshold-1.5] = 0
combined_distribution = multiple + single
# Cumulative Distribution Function (CDF) -- thanks chatGPT
cdf = cumtrapz(combined_distribution, alpha_values, initial=0)
cdf /= cdf[-1] # normalized CDF
inverse_cdf = interp1d(cdf, alpha_values) # Interpolation function for the inverse CDF
# Data
num_samples = 100000
uniform_samples = np.random.uniform(0, 1, num_samples)
generated_samples = inverse_cdf(uniform_samples)
num_bins = 300
counts, bins = np.histogram(generated_samples, bins=num_bins)
bin_centers = 0.5 * (bins[1:] + bins[:-1])
errors = np.sqrt(counts) # Poisson error per bin
bin_width = bins[1] - bins[0]
plt.figure(figsize=(10, 6))
plt.errorbar(bin_centers, counts, yerr=errors, fmt='o', label='Data with Error Bars', color='#1E88E5', ecolor='black', zorder=1)
plt.plot(alpha_values, combined_distribution * num_samples / np.sum(combined_distribution), label='Distribution from which data is drawn', linestyle='--', color ='red', zorder=2)
params = [4500, 0.02, 0, 0.7, 3, 5] # played quite a bit to get those
popt, pcov = curve_fit(fit_function, bin_centers, counts, p0=params, sigma=errors)
theta_values = np.linspace(min(generated_samples), max(generated_samples), 500)
fitted_function_values = fit_function(theta_values, *popt)
plt.plot(theta_values, fitted_function_values, label='Fitted function (curve_fit)', color='#004D40', zorder=3)
### minuit below
# total_samples = np.sum(counts)
# bin_width = bins[1] - bins[0]
# scale_factor = total_samples * bin_width
# print(scale_factor)
m = Minuit(cost_function, N=10000, a=0.3, mu=0, sigma_g=0.7, sigma=3, tail_parameter=3)
m.limits['sigma_g', 'sigma', 'tail_parameter'] = (1e-6, None) # parameter limits
m.errordef = Minuit.LEAST_SQUARES
m.migrad()
m.hesse()
fitted_params_minuit = [m.values[key] for key in m.parameters]
fitted_function_scaled = fit_function(theta_values, *fitted_params_minuit)*0.05
plt.plot(theta_values, fitted_function_scaled, label='Fitted Function (Minuit)', color='green')
plt.xlabel('Angle')
plt.ylabel('Counts')
plt.legend()
plt.yscale('log')
plt.ylim(0.5,1e4)
plt.show()
# Chi2 for curve_fit
adjusted_errors = np.where(errors == 0, 1, errors)
chi2_curve_fit = np.sum(((counts - fit_function(bin_centers, *popt)) / adjusted_errors) ** 2)
dof_curve_fit = len(counts) - len(popt)
chi2_per_dof_curve_fit = chi2_curve_fit / dof_curve_fit
print("curve_fit - Fitted parameters:", popt)
print("curve_fit - Chi-squared:", chi2_curve_fit)
print("curve_fit - Chi-squared per degree of freedom:", chi2_per_dof_curve_fit)
# Chi-squared for Minuit
chi2_minuit = np.sum(((counts - fit_function(bin_centers, *fitted_params_minuit)) / adjusted_errors) ** 2)
dof_minuit = len(counts) - len(fitted_params_minuit)
chi2_per_dof_minuit = chi2_minuit / dof_minuit
print("Minuit - Fitted parameters:", fitted_params_minuit)
print("Minuit - Chi-squared:", chi2_minuit)
print("Minuit - Chi-squared per degree of freedom:", chi2_per_dof_minuit)
Results:
curve_fit - Fitted parameters: [4.5e+03 2.0e-02 0.0e+00 7.0e-01 3.0e+00 5.0e+00]
curve_fit - Chi-squared: 1087.3770894509803
curve_fit - Chi-squared per degree of freedom: 3.698561528744831
Minuit - Fitted parameters: [100032.27417296321, 0.0780285341472854, -0.001452392031778836, 0.7125402555391692, 0.7478831473831798, 2.673494961387995]
Minuit - Chi-squared: 42899843.300199464
Minuit - Chi-squared per degree of freedom: 145917.83435441996
Resulting fit:
I followed the advice in the comments to look for a function, but I admit I don't really know what I'm doing here apart from trial and error :)
There are a bunch of ways of doing this, but I thought I'd find a method that looked similar to what you showed in what you wanted, while I only implemented for one side, I've been on this for a bit too long now and it shouldn't be too hard to make it work I decided to implement this using bezier curves, which have the con of using a t variable which means it is harder to find exact values for a certain x or a certain y unless you have a high sample rate, as I didn't really know what you are implementing this for, I just went with this as this was my first thought to make it look the closest Bezier curves are essentially just a way of making a curve using three points
While this code is crude, it does closely match what you drew in your graph. It is fairly adjustable so you can make it look more to your liking
1.
2.
EDIT: forgot to mention this looked fairly weird when i used it with a log scale so i logged the values instead, I'm sorry if this is an inconvenience