Combined fit of two distributions


I have two separate distributions given by the functions P_multiple and P_single. I would like to combine the two into one fit function that has a natural "transition" between them. Do you have suggestions? Thank you!

Here is the plot I have versus what I'd like to get (in red):

[plot: the two distributions with the desired smooth transition sketched in red]

import numpy as np
import matplotlib.pyplot as plt

def P_multiple(alpha):
    return 1 / np.sqrt(np.pi) * np.exp(-alpha**2)

def P_single(alpha, Z):
    alpha_abs = np.abs(alpha)
    return 1 / (alpha_abs**3 * 8 * np.log(204 * Z**(-1/3)))

Z_silicon = 14 
alpha_threshold = 3 
alpha_values = np.linspace(-7, 7, 300)

multiple = P_multiple(alpha_values)
single = P_single(alpha_values, Z_silicon)

multiple[np.abs(alpha_values) > alpha_threshold] = 0
single[np.abs(alpha_values) <= alpha_threshold] = 0

combined_distribution = multiple + single

plt.figure(figsize=(10, 6))
plt.plot(alpha_values, multiple, label='Multiple')
plt.plot(alpha_values, single, label='Single')
plt.plot(alpha_values, combined_distribution, label='Combined', linestyle='--')

plt.legend()
plt.yscale('log') 
plt.show()
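
One way to get a smooth transition (an editorial sketch rather than anything from the question: the logistic steepness k and the clip at |alpha| = 0.5 are arbitrary choices) is to blend the two functions with a sigmoid weight centred on the threshold instead of switching hard, reusing the definitions above:

def smooth_blend(alpha, Z, alpha_threshold=3.0, k=5.0):
    # logistic weight: ~0 well inside the threshold (Gaussian core),
    # ~1 well outside it (single-scattering tail)
    w = 1.0 / (1.0 + np.exp(-k * (np.abs(alpha) - alpha_threshold)))
    # keep |alpha| away from zero so the 1/|alpha|^3 pole cannot leak into
    # the region where its weight should be negligible
    alpha_safe = np.clip(np.abs(alpha), 0.5, None)
    return (1.0 - w) * P_multiple(alpha) + w * P_single(alpha_safe, Z)

combined_smooth = smooth_blend(alpha_values, Z_silicon)

By construction this follows the Gaussian core for |alpha| well below the threshold and bends over into the power-law tail well above it, with no hard step in between.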

Later edit:

I have done a bit of digging for a simplified formula that might help here, and I found an answer here. Following it, I tried to generate some data and then fit it using scipy's curve_fit and Minuit minimization. curve_fit works to some extent, but the fit is not perfect. Minuit also works to some extent, but the initial fit is translated on the y axis; I managed to use a scaling factor to make the fit visually OK, but the chi2 per degree of freedom is very bad, so the scaling is probably not what I need. Any suggestions? The code is pasted below:

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from scipy.integrate import cumulative_trapezoid  # cumtrapz was removed in newer SciPy
from scipy.optimize import curve_fit
from scipy.special import gamma
from iminuit import Minuit

def P_multiple(alpha):
    return 1 / np.sqrt(np.pi) * np.exp(-alpha**2)

def P_single(alpha, Z):
    alpha_abs = np.abs(alpha)
    return 1 / (alpha_abs**3 * 8 * np.log(204 * Z**(-1/3)))

# The fit function (sum of a Gaussian and a Student's t-distribution)
def fit_function(theta, N, a, mu, sigma_g, sigma, tail_parameter):
    gaussian_part = (1 - a) * np.exp(-((theta - mu) ** 2) / (2 * sigma_g ** 2)) / (sigma_g * np.sqrt(2 * np.pi))
    t_part = (a * gamma((tail_parameter + 1) / 2) / (np.sqrt(tail_parameter * np.pi) * sigma * gamma(tail_parameter / 2)) *
             np.power(1 + ((theta - mu) ** 2) / (tail_parameter * sigma ** 2), -(tail_parameter + 1) / 2))
    return N * (gaussian_part + t_part)

# Binned Poisson likelihood, -2 log L up to a constant (defined for reference;
# cost_function below repeats the same expression for Minuit)
def binned_likelihood(params, theta, counts, bin_width):
    N, a, mu, sigma_g, sigma, tail_parameter = params
    expected_counts = fit_function(theta, N, a, mu, sigma_g, sigma, tail_parameter) * bin_width
    # Using Poisson likelihood
    return -2 * np.sum(counts * np.log(expected_counts) - expected_counts)

# Cost function for Minuit
def cost_function(N, a, mu, sigma_g, sigma, tail_parameter):
    params = [N, a, mu, sigma_g, sigma, tail_parameter]
    expected_counts = fit_function(bin_centers, *params) * bin_width
    expected_counts = np.maximum(expected_counts, 1e-6)  # Ensure non-zero expected counts
    cost = -2 * np.sum(counts * np.log(expected_counts) - expected_counts)
    return cost

Z_silicon = 14 
alpha_threshold = 3 
alpha_values = np.linspace(-7, 7, 300)

multiple = P_multiple(alpha_values)
single = P_single(alpha_values, Z_silicon)

multiple[np.abs(alpha_values) > alpha_threshold] = 0
single[np.abs(alpha_values) <= alpha_threshold-1.5] = 0

combined_distribution = multiple + single

# Cumulative Distribution Function (CDF) -- thanks chatGPT
cdf = cumulative_trapezoid(combined_distribution, alpha_values, initial=0)
cdf /= cdf[-1]  # normalized CDF
inverse_cdf = interp1d(cdf, alpha_values) # Interpolation function for the inverse CDF

# Data
num_samples = 100000
uniform_samples = np.random.uniform(0, 1, num_samples)
generated_samples = inverse_cdf(uniform_samples)

num_bins = 300
counts, bins = np.histogram(generated_samples, bins=num_bins)
bin_centers = 0.5 * (bins[1:] + bins[:-1])
errors = np.sqrt(counts)  # Poisson error per bin
bin_width = bins[1] - bins[0]


plt.figure(figsize=(10, 6))
plt.errorbar(bin_centers, counts, yerr=errors, fmt='o', label='Data with Error Bars', color='#1E88E5', ecolor='black', zorder=1)
plt.plot(alpha_values, combined_distribution * num_samples / np.sum(combined_distribution), label='Distribution from which data is drawn', linestyle='--', color ='red', zorder=2)

params = [4500, 0.02, 0, 0.7, 3, 5] # played quite a bit to get those
adjusted_errors = np.where(errors == 0, 1, errors)  # avoid zero sigma in empty bins
popt, pcov = curve_fit(fit_function, bin_centers, counts, p0=params, sigma=adjusted_errors)

theta_values = np.linspace(min(generated_samples), max(generated_samples), 500)
fitted_function_values = fit_function(theta_values, *popt)
plt.plot(theta_values, fitted_function_values, label='Fitted function (curve_fit)', color='#004D40', zorder=3)

### minuit below
# total_samples = np.sum(counts)
# bin_width = bins[1] - bins[0]
# scale_factor = total_samples * bin_width
# print(scale_factor)
m = Minuit(cost_function, N=10000, a=0.3, mu=0, sigma_g=0.7, sigma=3, tail_parameter=3)
m.limits['sigma_g', 'sigma', 'tail_parameter'] = (1e-6, None) # parameter limits
m.errordef = Minuit.LEAST_SQUARES
m.migrad()
m.hesse()

fitted_params_minuit = [m.values[key] for key in m.parameters]
# ad-hoc scale factor; note that 0.05 is close to bin_width (~14/300 ~ 0.047),
# the factor already applied inside cost_function
fitted_function_scaled = fit_function(theta_values, *fitted_params_minuit)*0.05
plt.plot(theta_values, fitted_function_scaled, label='Fitted Function (Minuit)', color='green')


plt.xlabel('Angle')
plt.ylabel('Counts')
plt.legend()
plt.yscale('log')
plt.ylim(0.5,1e4)
plt.show()

# Chi2 for curve_fit (reusing the adjusted errors from the fit)
chi2_curve_fit = np.sum(((counts - fit_function(bin_centers, *popt)) / adjusted_errors) ** 2)
dof_curve_fit = len(counts) - len(popt)
chi2_per_dof_curve_fit = chi2_curve_fit / dof_curve_fit
print("curve_fit - Fitted parameters:", popt)
print("curve_fit - Chi-squared:", chi2_curve_fit)
print("curve_fit - Chi-squared per degree of freedom:", chi2_per_dof_curve_fit)

# Chi-squared for Minuit
# (note: unlike cost_function, no bin_width factor is applied to fit_function
# here, which is part of why this chi2 comes out enormous)
chi2_minuit = np.sum(((counts - fit_function(bin_centers, *fitted_params_minuit)) / adjusted_errors) ** 2)
dof_minuit = len(counts) - len(fitted_params_minuit)
chi2_per_dof_minuit = chi2_minuit / dof_minuit
print("Minuit - Fitted parameters:", fitted_params_minuit)
print("Minuit - Chi-squared:", chi2_minuit)
print("Minuit - Chi-squared per degree of freedom:", chi2_per_dof_minuit)

Results:

curve_fit - Fitted parameters: [4.5e+03 2.0e-02 0.0e+00 7.0e-01 3.0e+00 5.0e+00]
curve_fit - Chi-squared: 1087.3770894509803
curve_fit - Chi-squared per degree of freedom: 3.698561528744831
Minuit - Fitted parameters: [100032.27417296321, 0.0780285341472854, -0.001452392031778836, 0.7125402555391692, 0.7478831473831798, 2.673494961387995]
Minuit - Chi-squared: 42899843.300199464
Minuit - Chi-squared per degree of freedom: 145917.83435441996
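
(Editorial note on the numbers above: the ad-hoc factor of 0.05 is suspiciously close to the bin width, roughly 14/300 ≈ 0.047, which cost_function already multiplies into expected_counts but which the plotting and chi2 code does not. If that is the culprit, evaluating the Minuit chi2 with the same factor should bring it in line; a hypothesis to check rather than a guaranteed fix:

minuit_model = fit_function(bin_centers, *fitted_params_minuit) * bin_width
chi2_minuit_scaled = np.sum(((counts - minuit_model) / adjusted_errors) ** 2)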

Resulting fit:

[plot of the resulting fits]

I followed the advice in the comments to look for a function, but I admit I don't really know what I'm doing here apart from trial and error :)
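
(As a further editorial aside, assuming iminuit ≥ 2: the library ships ready-made cost functions in iminuit.cost that handle errordef and the per-point errors for you. A minimal sketch reusing fit_function, bin_centers, counts, adjusted_errors and bin_width from above:

from iminuit import Minuit
from iminuit.cost import LeastSquares

def model(theta, N, a, mu, sigma_g, sigma, tail_parameter):
    # scale by bin_width so the model is in counts per bin, like the histogram
    return fit_function(theta, N, a, mu, sigma_g, sigma, tail_parameter) * bin_width

cost = LeastSquares(bin_centers, counts, adjusted_errors, model)
m2 = Minuit(cost, N=100000, a=0.02, mu=0, sigma_g=0.7, sigma=3, tail_parameter=5)
m2.limits['sigma_g', 'sigma', 'tail_parameter'] = (1e-6, None)
m2.migrad()
print(m2.values)
print("chi2/dof:", m2.fval / (len(counts) - m2.nfit))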

Answer:

There are a bunch of ways of doing this, but I thought I'd find a method that looks similar to the red curve you showed in what you wanted. I only implemented it for one side, as I've been on this for a bit too long now, but it shouldn't be too hard to make it work for both. I decided to implement this using Bezier curves. Their downside is the t parameter, which makes it harder to find exact values for a given x or y unless you use a high sample rate, but since I didn't really know what you are implementing this for, I went with my first thought for making it look closest to your sketch. A Bezier curve is essentially just a way of making a curve from three points.
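
For reference (the standard definition, not stated explicitly in the answer), a quadratic Bezier curve through control points P0, P1 and P2 is

B(t) = (1 - t)^2 * P0 + 2 * (1 - t) * t * P1 + t^2 * P2,   0 <= t <= 1,

which is exactly the nested linear interpolation that bezierline and beziercurve implement below.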

import numpy as np
import matplotlib.pyplot as plt

def P_multiple(alpha):
    return 1 / np.sqrt(np.pi) * np.exp(-alpha**2)

def P_single(alpha, Z):
    alpha_abs = np.abs(alpha)
    return 1 / (alpha_abs**3 * 8 * np.log(204 * Z**(-1/3)))

def findcrosspoint(error, dotpoints, valuem, values):
    # finds the index of the closest crossing point by approximation, scanning
    # from the right; returns -1 if the curves never come within error
    for i in range(dotpoints-1, 0, -1):
        if abs(values[i] - valuem[i]) <= error:
            return i
    return -1

def bezierline(startpoint, endpoint, t):
    # creates a bezier line
    return startpoint + t * (endpoint-startpoint)

def beziercurve(startpoint, crosspoint, endpoint, res):
    # quadratic Bezier: curves between two bezier lines
    curvevals = []
    for i in range(res):
        t = i / res
        curvevals.append(bezierline(startpoint, crosspoint, t)
                         + t * (bezierline(crosspoint, endpoint, t) - bezierline(startpoint, crosspoint, t)))
    return np.array(curvevals)

def transition(Z, dotpoints, viewsize, startpoint, thresh, transitionrange, xweight, yweight):
    # calculates all the values
    xs = np.linspace(startpoint, startpoint+viewsize, dotpoints)
    valuem = P_multiple(xs)
    values = P_single(xs, Z)
    transitionpoint = thresh

    # calculates start, cross and endpoints to calculate the bezier curve
    startpoint = transitionpoint - transitionrange
    startpoint = np.array([startpoint, np.log(P_multiple(startpoint))])
    # convert the crossing index back into an x value
    crosspoint = findcrosspoint(0.1, dotpoints, np.log(valuem), np.log(values)) * (viewsize/dotpoints)
    
    # the x and y weight allow you to change the position of the cross point which can change the look of the curve
    crosspoint = np.array([crosspoint / xweight, np.log(P_multiple(crosspoint) / yweight)])
    plt.scatter(np.array([crosspoint[0]]), np.array([crosspoint[1]]), s=20, label='cross')
    plt.scatter(np.array([startpoint[0]]), np.array([startpoint[1]]), s=20, label='start')
    endpoint = transitionpoint + transitionrange
    endpoint = np.array([endpoint, np.log(P_single(endpoint, Z))])
    plt.scatter(np.array([endpoint[0]]), np.array([endpoint[1]]), s=20, label='end')

    # calculates the bezier curve values
    curvevals = beziercurve(startpoint, crosspoint, endpoint, 10000)
    # puts them all back into one array
    values = []
    inserted = False
    for i in range(xs.shape[0]):
        if xs[i] <= startpoint[0]:
            values.append([xs[i], np.log(P_multiple(xs[i]))])
        elif xs[i] >= endpoint[0]:
            values.append([xs[i], np.log(P_single(xs[i], Z))])
        elif inserted == False:
            for value in curvevals:
                values.append(np.array([value[0], value[1]]))
            inserted = True
    return np.array(values)


Z_silicon = 14
alpha_threshold = 3
alpha_values = np.linspace(-7, 7, 500)

multiple = P_multiple(alpha_values)
single = P_single(alpha_values, Z_silicon)

multiple[np.abs(alpha_values) > alpha_threshold] = 0
single[np.abs(alpha_values) <= alpha_threshold] = 0

combined_distribution = multiple + single

plt.figure(figsize=(10, 6))
plt.plot(alpha_values, np.log(multiple), label='Multiple')
plt.plot(alpha_values, np.log(single), label='Single')
values = transition(Z_silicon, 1000, 7, 0, 3, 1, 1, 1)
xs = values[:, 0]
ys = values[:, 1]
plt.plot(xs, ys, label='combined')
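# (editor's sketch, not in the original answer: the answer only builds the
# positive side; since P_multiple and P_single depend only on |alpha|, the
# curve can be mirrored to negative alpha by symmetry)
plt.plot(-xs, ys, label='combined (mirrored)')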

plt.legend()
plt.show()

While this code is crude, it does closely match what you drew in your graph, and it is fairly adjustable, so you can tweak it to look more to your liking.


EDIT: I forgot to mention that this looked fairly weird when I used it with a log scale, so I plotted the log of the values instead. Sorry if this is an inconvenience.