This implementation uses Iterative Proportional Fitting to create a synthetic population that matches target marginal distributions for age, gender, and education.
The IPF process iteratively rescales the weight of every category combination, one dimension at a time, until the weighted totals match each target marginal. A full explanation of IPF is given alongside the code below.
import numpy as np
import pandas as pd
from itertools import product
def ipf_synthetic_population(marginals=None, population_size=None,
                             max_iterations=50, tolerance=1e-6,
                             random_state=None):
    """Create a synthetic population via Iterative Proportional Fitting (IPF).

    Fits cell weights over the cross-product of all category combinations so
    that the weighted sums match the target marginal of every dimension, then
    samples individuals proportionally to the fitted weights.

    Parameters
    ----------
    marginals : dict[str, dict[str, int]] | None
        Mapping of dimension name -> {category: target count}. Every
        dimension's targets should sum to the same total. Defaults to the
        original demo targets (age, gender, education; total 1500).
    population_size : int | None
        Number of individuals to sample. Defaults to the total implied by
        the first marginal (1500 for the demo targets).
    max_iterations : int
        Cap on IPF sweeps over all dimensions.
    tolerance : float
        Convergence threshold on the largest adjustment factor's deviation
        from 1 within a sweep.
    random_state : int | None
        Seed forwarded to ``DataFrame.sample`` for reproducible draws;
        ``None`` (the default) keeps the original non-deterministic behavior.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', <dimension names...>]`` with ``population_size`` rows.
    """
    # 1. Target marginal distributions (counts per category).
    if marginals is None:
        marginals = {
            'age': {'18-25': 500, '26-35': 1000},
            'gender': {'Male': 600, 'Female': 900},
            'education': {'School': 800, 'Degree': 700},
        }
    if population_size is None:
        # Derive the population total from the first dimension's targets.
        population_size = sum(next(iter(marginals.values())).values())

    # 2. One row per cell of the full cross-product of categories.
    categories = {dim: list(targets) for dim, targets in marginals.items()}
    combinations = list(product(*categories.values()))

    # 3. Uniform starting weights; float avoids an integer-dtype column.
    df = pd.DataFrame(combinations, columns=list(categories))
    df['weight'] = 1.0

    # 4. IPF: rescale weights dimension-by-dimension until marginals match.
    converged = False
    for iteration in range(max_iterations):
        max_diff = 0.0
        for dimension in categories:
            # Current weighted total per category of this dimension.
            current_margins = df.groupby(dimension)['weight'].sum()
            adjustments = {}
            for category, target in marginals[dimension].items():
                current = current_margins.get(category, 0)
                # Factor 1.0 leaves zero-weight cells alone (no div-by-zero).
                adjustments[category] = target / current if current > 0 else 1.0
            df['weight'] *= df[dimension].map(adjustments)
            # Largest deviation of any factor from 1 drives convergence.
            max_diff = max(max_diff,
                           max(abs(1 - f) for f in adjustments.values()))
        if max_diff < tolerance:
            print(f"Converged after {iteration+1} iterations")
            converged = True
            break
    if not converged:
        print(f"Stopped after {max_iterations} iterations (max reached)")

    # 5. Sample individuals proportionally to the fitted cell weights.
    df['probability'] = df['weight'] / df['weight'].sum()
    synthetic_pop = df.sample(
        n=population_size,
        weights='probability',
        replace=True,
        random_state=random_state,
    ).reset_index(drop=True)

    # Sequential 1-based IDs; return only the public columns.
    synthetic_pop['id'] = range(1, len(synthetic_pop) + 1)
    return synthetic_pop[['id'] + list(categories)]
# Generate and verify the population only when run as a script, not on
# import (standard entry-point guard).
if __name__ == "__main__":
    synthetic_pop = ipf_synthetic_population()
    print("\nSynthetic population head:")
    print(synthetic_pop.head())
    # Sampled counts should approximate the target marginals.
    print("\nMarginal verification:")
    print("Age counts:\n", synthetic_pop['age'].value_counts())
    print("\nGender counts:\n", synthetic_pop['gender'].value_counts())
    print("\nEducation counts:\n", synthetic_pop['education'].value_counts())
The inputs to the procedure are the target marginal distributions for each dimension (age, gender, and education).
| Method | Strengths | Weaknesses |
| --- | --- | --- |
| IPF | Mathematically rigorous, preserves structure, efficient | Assumes positive interactions, may flatten rare combinations |
| Simulated Annealing | Handles complex constraints, avoids local optima | Computationally intensive, requires parameter tuning |
| Deterministic Reweighting | Simple, fast | Only matches one dimension at a time, distorts joints |
IPF is often the preferred choice when you need to match multiple marginals simultaneously while preserving the joint distribution structure from your sample data.
A worked example (still under development) is available in this Colab notebook.