This implementation uses Simulated Annealing to create a synthetic population that matches target marginal distributions for age, gender, and education.
A version that uses parallel simulated annealing with shared memory is available separately (the link target is not included in this text).
The algorithm:
import numpy as np
import pandas as pd
import random
import math
# Desired number of individuals per category, for every attribute.
target_margins = {
    'age': {'18-25': 500, '26-35': 1000},
    'gender': {'Male': 600, 'Female': 900},
    'education': {'School': 800, 'Degree': 700},
}

# Admissible values per attribute, in the same order as the targets above.
categories = {dim: list(targets) for dim, targets in target_margins.items()}

# Simulated-annealing schedule.
initial_temp = 1000         # temperature at the start of the search
cooling_rate = 0.95         # factor the temperature is multiplied by when cooling
iterations_per_temp = 100   # candidate moves tried at each temperature
min_temp = 1                # the search runs only while temp is above this
def calculate_energy(population, margins=None):
    """Return the squared-error distance between observed and target margins.

    For every attribute in the margins, the occurrences of each target
    category in the matching ``population`` column are counted and compared
    with the target count; the squared differences are summed over all
    categories of all attributes. An energy of 0 means every target count
    is met exactly. Values present in the population but absent from the
    margins do not contribute.

    Args:
        population: DataFrame with one column per attribute named in the
            margins.
        margins: Mapping ``{attribute: {category: target_count}}``. Defaults
            to the module-level ``target_margins``.

    Returns:
        The sum of squared count differences (0 for empty ``margins``).

    Raises:
        KeyError: If ``population`` lacks a column for one of the attributes.
    """
    if margins is None:
        margins = target_margins

    energy = 0
    for dim, targets in margins.items():
        # Categories that never occur in the column are missing from the
        # counts, so they are looked up with a default of 0.
        counts = population[dim].value_counts().to_dict()
        energy += sum(
            (counts.get(cat, 0) - target) ** 2
            for cat, target in targets.items()
        )
    return energy
def generate_initial_population(size=1500, category_map=None):
    """Create a population whose attributes are drawn uniformly at random.

    Each individual gets one value per attribute, chosen independently with
    ``random.choice`` from that attribute's allowed values. Attributes are
    visited in the mapping's order, so the result has one column per
    attribute in that same order.

    Args:
        size: Number of individuals (rows) to generate.
        category_map: Mapping ``{attribute: [allowed values]}``. Defaults to
            the module-level ``categories``.

    Returns:
        DataFrame with ``size`` rows and one column per attribute. The
        columns are present even when ``size`` is 0.

    Raises:
        IndexError: If an attribute has an empty list of allowed values and
            ``size`` is positive.
    """
    if category_map is None:
        category_map = categories

    records = [
        {dim: random.choice(values) for dim, values in category_map.items()}
        for _ in range(size)
    ]
    # Passing the columns explicitly keeps the schema intact for an empty
    # population, which would otherwise come back without any columns.
    return pd.DataFrame(records, columns=list(category_map))
def mutate_population(population, category_map=None):
    """Return a copy of ``population`` with one randomly chosen cell redrawn.

    A row and an attribute are picked uniformly at random and that cell is
    overwritten with a value drawn uniformly from the attribute's allowed
    values. The draw may reproduce the current value, in which case the copy
    equals the input. ``population`` itself is never modified.

    Args:
        population: DataFrame with a column for every attribute in
            ``category_map``.
        category_map: Mapping ``{attribute: [allowed values]}``. Defaults to
            the module-level ``categories``.

    Returns:
        A new DataFrame; an unchanged copy when ``population`` has no rows.

    Raises:
        KeyError: If the chosen attribute is not a column of ``population``.
    """
    if category_map is None:
        category_map = categories

    new_pop = population.copy()
    if len(new_pop) == 0:
        # Nothing to mutate, and drawing a row from an empty range would raise.
        return new_pop

    row = random.randrange(len(new_pop))
    dim = random.choice(list(category_map))
    new_val = random.choice(category_map[dim])
    # Address the cell by position: `row` is a 0-based offset, not an index
    # label, so label-based access would touch the wrong row (or append a
    # new one) whenever the index is not exactly 0..n-1.
    new_pop.iat[row, new_pop.columns.get_loc(dim)] = new_val
    return new_pop
def simulated_annealing():
    """Search for a population whose margins match the targets.

    Starting from ``generate_initial_population()``, every step proposes a
    neighbor with ``mutate_population`` and scores it with
    ``calculate_energy``. A neighbor with lower energy is always accepted; any
    other neighbor is accepted with probability ``exp(-delta / temp)``.
    ``iterations_per_temp`` proposals are tried at each temperature, after
    which the temperature is multiplied by ``cooling_rate``. The search ends
    when the temperature is no longer above ``min_temp`` or as soon as a
    perfect match (energy 0) has been found.

    Returns:
        DataFrame with the columns ``id``, ``age``, ``gender`` and
        ``education`` holding the lowest-energy population seen during the
        search; ``id`` numbers the rows from 1.
    """
    current_pop = generate_initial_population()
    current_energy = calculate_energy(current_pop)
    best_pop = current_pop.copy()
    best_energy = current_energy

    temp = initial_temp
    # The energy is a sum of squares, so 0 cannot be improved on: stop as
    # soon as it is reached instead of running out the rest of the schedule.
    while temp > min_temp and best_energy > 0:
        for _ in range(iterations_per_temp):
            # Generate and score a neighbor solution.
            candidate = mutate_population(current_pop)
            candidate_energy = calculate_energy(candidate)
            delta_energy = candidate_energy - current_energy

            # Metropolis acceptance rule. The `delta_energy < 0` test comes
            # first so exp() is only evaluated for a non-positive argument.
            if delta_energy < 0 or random.random() < math.exp(-delta_energy / temp):
                current_pop = candidate
                current_energy = candidate_energy

                # Remember the best solution seen so far.
                if current_energy < best_energy:
                    best_pop = current_pop.copy()
                    best_energy = current_energy
                    if best_energy == 0:
                        break

        # Cool the system.
        temp *= cooling_rate

    # Add IDs to the best population found.
    best_pop['id'] = range(1, len(best_pop) + 1)
    return best_pop[['id', 'age', 'gender', 'education']]
# Build the synthetic population by running the annealer once.
synthetic_pop = simulated_annealing()

# Show a sample of the result, then the achieved count per category so the
# margins can be compared with the targets.
print(
    "Synthetic population head:",
    synthetic_pop.head(),
    "\nMarginal verification:",
    sep="\n",
)
for heading, column in (
    ("Age counts:\n", 'age'),
    ("\nGender counts:\n", 'gender'),
    ("\nEducation counts:\n", 'education'),
):
    print(heading, synthetic_pop[column].value_counts())
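Energy function (`calculate_energy`): measures how well the current population matches the target margins, as the sum of squared differences between observed and target counts.
Mutation (`mutate_population`): randomly changes one attribute of one individual to create a neighbor solution.
Temperature schedule (`simulated_annealing`): controls exploration vs. exploitation with a temperature parameter that decreases over time.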
Note: May require parameter tuning (temperature schedule, cooling rate) for optimal performance.