Distribution Fitter
by a5c-ai
Statistical distribution fitting skill for input modeling in simulation and analysis.
Skill Details
Repository Files
1 file in this skill directory
name: distribution-fitter description: Statistical distribution fitting skill for input modeling in simulation and analysis. allowed-tools: Bash(*) Read Write Edit Glob Grep WebFetch metadata: author: babysitter-sdk version: "1.0.0" category: simulation backlog-id: SK-IE-006
distribution-fitter
You are distribution-fitter - a specialized skill for fitting statistical distributions to data for input modeling in simulation and analysis.
Overview
This skill enables AI-powered distribution fitting including:
- Goodness-of-fit testing (Chi-square, K-S, Anderson-Darling)
- Maximum likelihood estimation
- Distribution parameter estimation
- Inter-arrival time analysis
- Service time distribution fitting
- Empirical distribution construction
- Distribution comparison and selection
Prerequisites
- Python 3.8+ with scipy, fitter installed
- Statistical analysis libraries
- Understanding of probability distributions
Capabilities
1. Automated Distribution Fitting
from fitter import Fitter
import numpy as np
def fit_distribution(data, distributions=None):
"""
Fit multiple distributions and select best fit
"""
if distributions is None:
distributions = ['norm', 'expon', 'gamma', 'lognorm',
'weibull_min', 'beta', 'uniform', 'triang']
f = Fitter(data, distributions=distributions)
f.fit()
# Get summary
summary = f.summary()
# Best distribution
best = f.get_best(method='sumsquare_error')
return {
"best_distribution": list(best.keys())[0],
"parameters": best,
"summary": summary.to_dict(),
"all_fits": f.fitted_param
}
2. Goodness-of-Fit Testing
from scipy import stats
import numpy as np
def goodness_of_fit_tests(data, distribution, params):
"""
Perform multiple goodness-of-fit tests
"""
results = {}
# Kolmogorov-Smirnov test
ks_stat, ks_pvalue = stats.kstest(data, distribution, args=params)
results['kolmogorov_smirnov'] = {
'statistic': ks_stat,
'p_value': ks_pvalue,
'conclusion': 'accept' if ks_pvalue > 0.05 else 'reject'
}
# Chi-square test
observed, bins = np.histogram(data, bins='auto')
dist = getattr(stats, distribution)
expected = len(data) * np.diff(dist.cdf(bins, *params))
# Combine bins with low expected counts
mask = expected >= 5
chi2_stat, chi2_pvalue = stats.chisquare(
observed[mask], expected[mask]
)
results['chi_square'] = {
'statistic': chi2_stat,
'p_value': chi2_pvalue,
'degrees_of_freedom': sum(mask) - len(params) - 1
}
# Anderson-Darling test (for specific distributions)
if distribution in ['norm', 'expon', 'gumbel', 'logistic']:
ad_result = stats.anderson(data, dist=distribution)
results['anderson_darling'] = {
'statistic': ad_result.statistic,
'critical_values': dict(zip(
['15%', '10%', '5%', '2.5%', '1%'],
ad_result.critical_values
))
}
return results
3. Maximum Likelihood Estimation
from scipy.optimize import minimize
from scipy import stats
def mle_fit(data, distribution):
"""
Fit distribution using maximum likelihood
"""
dist = getattr(stats, distribution)
# Get parameter bounds
bounds = get_parameter_bounds(distribution)
# Negative log-likelihood function
def neg_log_likelihood(params):
return -np.sum(dist.logpdf(data, *params))
# Initial guess
x0 = get_initial_params(data, distribution)
# Optimize
result = minimize(neg_log_likelihood, x0, bounds=bounds,
method='L-BFGS-B')
# Standard errors via Hessian
from scipy.optimize import approx_fprime
hessian = np.zeros((len(result.x), len(result.x)))
epsilon = 1e-5
for i in range(len(result.x)):
hessian[i] = approx_fprime(result.x,
lambda p: approx_fprime(p, neg_log_likelihood, epsilon)[i],
epsilon)
se = np.sqrt(np.diag(np.linalg.inv(hessian)))
return {
"distribution": distribution,
"parameters": result.x.tolist(),
"standard_errors": se.tolist(),
"log_likelihood": -result.fun,
"aic": 2 * len(result.x) + 2 * result.fun,
"bic": len(result.x) * np.log(len(data)) + 2 * result.fun
}
4. Inter-arrival Time Analysis
def analyze_interarrival_times(timestamps):
"""
Analyze inter-arrival times from timestamp data
"""
# Calculate inter-arrival times
timestamps = np.array(timestamps)
interarrivals = np.diff(timestamps)
# Basic statistics
stats_summary = {
"count": len(interarrivals),
"mean": np.mean(interarrivals),
"std": np.std(interarrivals),
"cv": np.std(interarrivals) / np.mean(interarrivals),
"min": np.min(interarrivals),
"max": np.max(interarrivals),
"median": np.median(interarrivals)
}
# Fit exponential (Poisson process test)
exp_params = stats.expon.fit(interarrivals, floc=0)
ks_stat, ks_pvalue = stats.kstest(interarrivals, 'expon', args=exp_params)
is_poisson = ks_pvalue > 0.05 and 0.8 < stats_summary['cv'] < 1.2
# Fit other distributions
fit_result = fit_distribution(interarrivals)
return {
"statistics": stats_summary,
"poisson_process_test": {
"ks_statistic": ks_stat,
"p_value": ks_pvalue,
"cv_test": stats_summary['cv'],
"is_poisson": is_poisson
},
"best_fit": fit_result,
"arrival_rate": 1 / stats_summary['mean']
}
5. Empirical Distribution
class EmpiricalDistribution:
"""
Create empirical distribution from data
"""
def __init__(self, data):
self.data = np.sort(data)
self.n = len(data)
self.ecdf = np.arange(1, self.n + 1) / self.n
def cdf(self, x):
"""Cumulative distribution function"""
return np.searchsorted(self.data, x, side='right') / self.n
def ppf(self, q):
"""Percent point function (inverse CDF)"""
idx = int(q * self.n)
return self.data[min(idx, self.n - 1)]
def sample(self, size=1):
"""Generate random samples"""
u = np.random.uniform(0, 1, size)
return np.array([self.ppf(ui) for ui in u])
def to_dict(self):
"""Export for storage"""
return {
"type": "empirical",
"values": self.data.tolist(),
"probabilities": self.ecdf.tolist()
}
6. Distribution Comparison
def compare_distributions(data, candidates):
"""
Compare multiple distribution fits
"""
results = []
for dist_name in candidates:
try:
dist = getattr(stats, dist_name)
params = dist.fit(data)
# Log-likelihood
ll = np.sum(dist.logpdf(data, *params))
# Information criteria
k = len(params)
n = len(data)
aic = 2 * k - 2 * ll
bic = k * np.log(n) - 2 * ll
# KS test
ks_stat, ks_pvalue = stats.kstest(data, dist_name, args=params)
results.append({
"distribution": dist_name,
"parameters": params,
"log_likelihood": ll,
"aic": aic,
"bic": bic,
"ks_statistic": ks_stat,
"ks_pvalue": ks_pvalue
})
except Exception as e:
continue
# Sort by AIC
results.sort(key=lambda x: x['aic'])
return {
"rankings": results,
"best_by_aic": results[0]['distribution'],
"best_by_bic": min(results, key=lambda x: x['bic'])['distribution']
}
Process Integration
This skill integrates with the following processes:
discrete-event-simulation-modeling.jsqueuing-system-analysis.jsdemand-forecasting-model-development.js
Output Format
{
"data_summary": {
"n": 500,
"mean": 5.2,
"std": 2.1,
"cv": 0.40
},
"best_fit": {
"distribution": "gamma",
"parameters": {"shape": 6.1, "scale": 0.85},
"goodness_of_fit": {
"ks_statistic": 0.032,
"ks_pvalue": 0.67,
"aic": 1523.4
}
},
"alternative_fits": [
{"distribution": "lognorm", "aic": 1528.1},
{"distribution": "weibull", "aic": 1531.2}
],
"recommendation": "Use gamma(6.1, 0.85) for simulation input"
}
Tools/Libraries
| Library | Description | Use Case |
|---|---|---|
| scipy.stats | Statistical functions | Core fitting |
| fitter | Auto fitting | Quick analysis |
| statsmodels | Advanced stats | Detailed tests |
| R fitdistrplus | R package | Complex fitting |
Best Practices
- Visualize first - Always plot histograms and Q-Q plots
- Consider theory - Choose distributions based on process
- Test multiple - Compare several candidate distributions
- Check tails - Extreme values matter for simulation
- Document choice - Record rationale for selected distribution
- Update periodically - Re-fit as new data becomes available
Constraints
- Report goodness-of-fit statistics, not just parameters
- Document data collection methodology
- Consider censored or truncated data
- Test for time-varying parameters
Related Skills
Attack Tree Construction
Build comprehensive attack trees to visualize threat paths. Use when mapping attack scenarios, identifying defense gaps, or communicating security risks to stakeholders.
Grafana Dashboards
Create and manage production Grafana dashboards for real-time visualization of system and application metrics. Use when building monitoring dashboards, visualizing metrics, or creating operational observability interfaces.
Matplotlib
Foundational plotting library. Create line plots, scatter, bar, histograms, heatmaps, 3D, subplots, export PNG/PDF/SVG, for scientific visualization and publication figures.
Scientific Visualization
Create publication figures with matplotlib/seaborn/plotly. Multi-panel layouts, error bars, significance markers, colorblind-safe, export PDF/EPS/TIFF, for journal-ready scientific plots.
Seaborn
Statistical visualization. Scatter, box, violin, heatmaps, pair plots, regression, correlation matrices, KDE, faceted plots, for exploratory analysis and publication figures.
Shap
Model interpretability and explainability using SHAP (SHapley Additive exPlanations). Use this skill when explaining machine learning model predictions, computing feature importance, generating SHAP plots (waterfall, beeswarm, bar, scatter, force, heatmap), debugging models, analyzing model bias or fairness, comparing models, or implementing explainable AI. Works with tree-based models (XGBoost, LightGBM, Random Forest), deep learning (TensorFlow, PyTorch), linear models, and any black-box model
Pydeseq2
Differential gene expression analysis (Python DESeq2). Identify DE genes from bulk RNA-seq counts, Wald tests, FDR correction, volcano/MA plots, for RNA-seq analysis.
Query Writing
For writing and executing SQL queries - from simple single-table queries to complex multi-table JOINs and aggregations
Pydeseq2
Differential gene expression analysis (Python DESeq2). Identify DE genes from bulk RNA-seq counts, Wald tests, FDR correction, volcano/MA plots, for RNA-seq analysis.
Scientific Visualization
Meta-skill for publication-ready figures. Use when creating journal submission figures requiring multi-panel layouts, significance annotations, error bars, colorblind-safe palettes, and specific journal formatting (Nature, Science, Cell). Orchestrates matplotlib/seaborn/plotly with publication styles. For quick exploration use seaborn or plotly directly.
