MRP in RxInfer
In [1]:
import pandas as pd
import numpy as np
import os
import itertools
import gc
from scipy.optimize import brentq
from transforms import logit, invlogit, rescale
from model_builder import MRPModelBuilder
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import geopandas as gpd
from shapely.geometry import box
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 700
if 'mrp-replication' in os.listdir('.'):
os.chdir('mrp-replication')
run_full_models = True
########################################################################################
### labels
########################################################################################
reg_lkup = [3,4,4,3,4,4,1,1,5,3,3,4,4,2,2,2,2,3,3,1,1,1,2,2,3,2,4,2,4,1,1,4,1,3,2,2,3,4,1,1,3,2,3,3,4,1,3,4,1,2,4]
reg_label = ["Northeast", "Midwest", "South", "West", "DC"]
eth_label = ["White", "Black", "Hispanic", "Other"]
inc_label = ["$0-20k", "$20-40k", "$40-75k", "$75-150k", "$150k+"]
age_label = ["18-29", "30-44", "45-64", "65+"]
n_reg = len(reg_label)
n_stt = 51 # 50 states + DC
n_eth = len(eth_label)
n_inc = len(inc_label)
n_age = len(age_label)
state_abb = [
"AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
"MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
"SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]
state_name = [
"Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia",
"Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
"Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
"New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
"Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
"Washington", "West Virginia", "Wisconsin", "Wyoming"
]
stt_label = state_abb
########################################################################################
### get data
########################################################################################
### state-level data
dat_stt = pd.read_csv("data/state-stats.dat", sep="\t")
dat_stt['z.inc2000'] = rescale(dat_stt['inc2000'])
dat_stt['z.inc2004'] = rescale(dat_stt['inc2004'])
dat_stt['z.inc2007'] = rescale(dat_stt['inc2007'])
dat_stt['z.rep1996'] = rescale(dat_stt['rep1996'])
dat_stt['z.rep2000'] = rescale(dat_stt['rep2000'])
dat_stt['z.rep2004'] = rescale(dat_stt['rep2004'])
dat_stt['z.rep2008'] = rescale(dat_stt['rep2008'])
dat_stt['z.trn1996'] = rescale(dat_stt['vote1996']/dat_stt['pop1996'])
dat_stt['z.trn2000'] = rescale(dat_stt['vote2000']/dat_stt['pop2000'])
dat_stt['z.trn2004'] = rescale(dat_stt['vote2004']/dat_stt['pop2004'])
dat_stt['z.trn2008'] = rescale(dat_stt['vote2008']/dat_stt['pop2007'])
dat_stt['stt'] = np.arange(1, len(dat_stt) + 1) # 1-based index
### CPS data for modeling turnout
dat_cps = pd.read_csv("data/cps2000-04-08-DKs.dat", sep="\t")
cols_to_check = dat_cps.columns[:8]
ok = (dat_cps[cols_to_check].isna().sum(axis=1) == 0) & ((dat_cps['cit'] == 1) | dat_cps['cit'].isna())
dat_cps = dat_cps[ok].copy()
dat_cps['reg'] = dat_cps['stt'].apply(lambda x: reg_lkup[x-1])
year_map = {'2000-cps': 2000, '2004-cps': 2004, '2008-cps': 2008}
dat_cps['year'] = dat_cps['file'].map(year_map)
dat_cps = dat_cps[dat_cps['year'] != 2000].copy()
### annenberg / pew data for modeling vote choice
dat_vot = pd.read_csv("data/votechoice2000-04-08.dat", sep="\t")
dat_vot['weight'] = dat_vot['weight'].fillna(1)
cols_to_check_vot = dat_vot.columns[:8]
ok_vot = (dat_vot[cols_to_check_vot].isna().sum(axis=1) == 0) & \
((dat_vot['cit'] == 1) | dat_vot['cit'].isna()) & \
((dat_vot['regist'] == "registered") | dat_vot['regist'].isna())
dat_vot = dat_vot[ok_vot].copy()
dat_vot['reg'] = dat_vot['stt'].apply(lambda x: reg_lkup[x-1])
year_map_vot = {'2000-ann': 2000, '2004-ann': 2004, '2008-pew': 2008}
dat_vot['year'] = dat_vot['file'].map(year_map_vot)
dat_vot = dat_vot[dat_vot['year'] != 2000].copy()
### census data from PUMS for population cell sizes
dat_pop = pd.read_csv("data/census-pums-pop-2000-04-08.dat", sep="\t")
### prepare data for looping through years
years = [2004, 2008]
L_z_incstt = [dat_stt['z.inc2004'], dat_stt['z.inc2007']]
L_z_repprv = [dat_stt['z.rep2000'], dat_stt['z.rep2004']]
L_z_trnprv = [dat_stt['z.trn2000'], dat_stt['z.trn2004']]
print("Preprocessing complete.", flush=True)
########################################################################################
### multilevel models
########################################################################################
# Create grid D
stt_range = range(n_stt) # 0 to 50
eth_range = range(n_eth) # 0 to 3
inc_range = range(n_inc) # 0 to 4
age_range = range(n_age) # 0 to 3
D = pd.DataFrame(list(itertools.product(stt_range, eth_range, inc_range, age_range)),
columns=["stt", "eth", "inc", "age"])
D['grp'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['inc']}_{row['age']}", axis=1)
D['ix'] = np.arange(len(D))
reg_lkup_0 = np.array(reg_lkup) - 1
D['reg'] = D['stt'].apply(lambda x: reg_lkup_0[x])
def encode_interaction(df, cols):
return df[cols].apply(lambda row: "_".join(row.values.astype(str)), axis=1).astype('category').cat.codes
interaction_cols = [
['reg', 'eth'], ['reg', 'inc'], ['reg', 'age'],
['stt', 'eth'], ['stt', 'inc'], ['stt', 'age'],
['eth', 'inc'], ['eth', 'age'], ['inc', 'age'],
# 3-way interactions commented out due to OOM issues with Pathfinder
# ['stt', 'eth', 'inc'], ['stt', 'eth', 'age'], ['stt', 'inc', 'age'], ['eth', 'inc', 'age']
]
for cols in interaction_cols:
name = "_".join(cols)
D[name] = encode_interaction(D, cols)
D['z_inc'] = rescale(D['inc'])
def get_model_data(year, data_source, D_grid, target_col):
# Filter by year
tmp = data_source[data_source['year'] == year].copy()
# Align indices (assuming data is 1-based, grid is 0-based)
tmp['stt_0'] = tmp['stt'] - 1
tmp['eth_0'] = tmp['eth'] - 1
tmp['inc_0'] = tmp['inc'] - 1
tmp['age_0'] = tmp['age'] - 1
tmp['grp'] = tmp.apply(lambda row: f"{int(row['stt_0'])}_{int(row['eth_0'])}_{int(row['inc_0'])}_{int(row['age_0'])}", axis=1)
def weighted_mean(x):
if len(x) == 0: return 0
w = x['weight']
v = x[target_col]
return np.sum(v * w) / np.sum(w)
def design_effect(x):
if len(x) <= 1: return 1
w = x['weight']
norm_w = w / w.mean()
return 1 + np.var(norm_w, ddof=1)
grouped = tmp.groupby('grp')
mr = pd.DataFrame({
'n': grouped.size(),
'ybar_wt': grouped.apply(weighted_mean, include_groups=False),
'des_eff_cell': grouped.apply(design_effect, include_groups=False)
})
D_tmp = D_grid.merge(mr, on='grp', how='left')
D_tmp['n'] = D_tmp['n'].fillna(0)
D_tmp['des_eff_cell'] = D_tmp['des_eff_cell'].fillna(1)
mask = D_tmp['n'] > 1
if mask.sum() > 0:
des_eff = np.average(D_tmp.loc[mask, 'des_eff_cell'], weights=D_tmp.loc[mask, 'n'])
else:
des_eff = 1
D_tmp['n_eff'] = D_tmp['n'] / des_eff
D_tmp.loc[D_tmp['n_eff'] == 0, 'ybar_wt'] = 0.5
D_tmp['trials'] = D_tmp['n_eff']
D_tmp['successes'] = D_tmp['ybar_wt'] * D_tmp['n_eff']
D_tmp['trials_int'] = D_tmp['trials'].round().astype(int)
D_tmp['successes_int'] = D_tmp['successes'].round().astype(int)
return D_tmp
L_z_incstt_map = {2004: dat_stt['z.inc2004'].values, 2008: dat_stt['z.inc2007'].values}
L_z_repprv_map = {2004: dat_stt['z.rep2000'].values, 2008: dat_stt['z.rep2004'].values}
L_z_trnprv_map = {2004: dat_stt['z.trn2000'].values, 2008: dat_stt['z.trn2004'].values}
for year in years:
print(f"***** Multilevel Models for {year}", flush=True)
# Update D with year-specific state predictors
z_incstt_vals = L_z_incstt_map[year]
z_repprv_vals = L_z_repprv_map[year]
z_trnprv_vals = L_z_trnprv_map[year]
D['z_incstt'] = D['stt'].apply(lambda x: z_incstt_vals[x])
D['z_repprv'] = D['stt'].apply(lambda x: z_repprv_vals[x])
D['z_trnprv'] = D['stt'].apply(lambda x: z_trnprv_vals[x])
# --- Turnout Model ---
print("***** CPS Turnout Model", flush=True)
data_cps = get_model_data(year, dat_cps, D, 'vote')
# Filter observed
observed_mask = data_cps['trials_int'] > 0
df_obs = data_cps[observed_mask].copy()
# Add interaction columns to df_obs (they are in D, merged in get_model_data)
# get_model_data merges D_grid (which has interactions) with stats.
# So df_obs has them.
# Config
if run_full_models:
turnout_config = {
'fixed_effects': ['z_inc', 'z_incstt', 'z_trnprv'],
'fixed_interactions': [('z_inc', 'z_incstt'), ('z_inc', 'z_trnprv')],
'random_effects': {
'reg': {'idx_col': 'reg', 'n_cats': n_reg, 'slope_col': 'z_inc'},
'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': 'z_inc'},
'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': 'z_inc'},
'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': 'z_inc'},
}
}
# Add interactions to random effects
for cols in interaction_cols:
name = "_".join(cols)
n_cats = int(D[name].max() + 1)
turnout_config['random_effects'][name] = {'idx_col': name, 'n_cats': n_cats, 'slope_col': None}
else:
turnout_config = {
'fixed_effects': [],
'random_effects': {
'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': None},
'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': None},
'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': None},
}
}
coords = {"obs_id": df_obs.index}
builder = MRPModelBuilder(df_obs, coords, turnout_config)
print("Building Turnout Model...", flush=True)
builder.build_model()
print("Fitting Turnout Model...", flush=True)
builder.fit(method="pathfinder", num_paths=10, num_draws=200, random_seed=42)
print("Predicting Turnout...", flush=True)
D[f'turn{year}_M'] = builder.predict(D)
print("Turnout Prediction complete.", flush=True)
# --- Vote Choice Model ---
print("***** Annenberg/Pew Vote Choice Model", flush=True)
data_vot = get_model_data(year, dat_vot, D, 'rvote')
observed_mask_vot = data_vot['trials_int'] > 0
df_obs_vot = data_vot[observed_mask_vot].copy()
if run_full_models:
vote_config = {
'fixed_effects': ['z_inc', 'z_incstt', 'z_repprv'],
'fixed_interactions': [('z_inc', 'z_incstt'), ('z_inc', 'z_repprv')],
'random_effects': {
'reg': {'idx_col': 'reg', 'n_cats': n_reg, 'slope_col': 'z_inc'},
'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': 'z_inc'},
'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': 'z_inc'},
'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': 'z_inc'},
}
}
for cols in interaction_cols:
name = "_".join(cols)
n_cats = int(D[name].max() + 1)
vote_config['random_effects'][name] = {'idx_col': name, 'n_cats': n_cats, 'slope_col': None}
else:
vote_config = {
'fixed_effects': [],
'random_effects': {
'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': None},
'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': None},
'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': None},
}
}
coords_vot = {"obs_id": df_obs_vot.index}
builder_vot = MRPModelBuilder(df_obs_vot, coords_vot, vote_config)
print("Building Vote Choice Model...", flush=True)
builder_vot.build_model()
print("Fitting Vote Choice Model...", flush=True)
builder_vot.fit(method="pathfinder", num_paths=10, num_draws=200, random_seed=42)
print("Predicting Vote Choice...")
D[f'vote{year}_M'] = builder_vot.predict(D)
print("Vote Choice Prediction complete.")
del builder, builder_vot, data_cps, data_vot, df_obs, df_obs_vot
gc.collect()
########################################################################################
### poststratification
########################################################################################
print("Starting Poststratification...", flush=True)
# Align indices for dat_pop
dat_pop['stt_0'] = dat_pop['stt'] - 1
dat_pop['eth_0'] = dat_pop['eth'] - 1
dat_pop['inc_0'] = dat_pop['inc'] - 1
dat_pop['age_0'] = dat_pop['age'] - 1
dat_pop['grp'] = dat_pop.apply(lambda row: f"{int(row['stt_0'])}_{int(row['eth_0'])}_{int(row['inc_0'])}_{int(row['age_0'])}", axis=1)
pop_agg = dat_pop.groupby('grp')[['wtd2004', 'wtd2008']].sum().reset_index()
pop_agg.rename(columns={'wtd2004': 'pop2004', 'wtd2008': 'pop2008'}, inplace=True)
D = D.merge(pop_agg, on='grp', how='left')
D['pop2004'] = D['pop2004'].fillna(0)
D['pop2008'] = D['pop2008'].fillna(0)
D = D.sort_values('ix')
### add up and alter based on actual turnout and vote estimates
def correct_weighted(a, w, target_val):
logit_a = logit(a)
def func(delta):
pred = invlogit(logit_a + delta)
return np.sum(pred * w) - target_val
try:
delta_opt = brentq(func, -10, 10)
except ValueError:
if np.sum(w) == 0: return a
try:
delta_opt = brentq(func, -100, 100)
except:
return a
return invlogit(logit_a + delta_opt)
D['turn2004_adj'] = np.nan
D['turn2008_adj'] = np.nan
D['vote2004_adj'] = np.nan
D['vote2008_adj'] = np.nan
print("Starting Calibration...", flush=True)
for i in range(n_stt):
stt_row = dat_stt.iloc[i]
mask = D['stt'] == i
if mask.sum() == 0: continue
# Turnout 2004
if 'turn2004_M' in D.columns:
D.loc[mask, 'turn2004_adj'] = correct_weighted(
D.loc[mask, 'turn2004_M'].values,
D.loc[mask, 'pop2004'].values,
stt_row['vote2004']
)
# Turnout 2008
if 'turn2008_M' in D.columns:
D.loc[mask, 'turn2008_adj'] = correct_weighted(
D.loc[mask, 'turn2008_M'].values,
D.loc[mask, 'pop2008'].values,
stt_row['vote2008']
)
# Vote 2004
if 'vote2004_M' in D.columns:
w_vot_04 = D.loc[mask, 'pop2004'].values
if w_vot_04.sum() > 0:
w_vot_04 = w_vot_04 / w_vot_04.sum()
D.loc[mask, 'vote2004_adj'] = correct_weighted(
D.loc[mask, 'vote2004_M'].values,
w_vot_04,
stt_row['rep2004']
)
# Vote 2008
if 'vote2008_M' in D.columns:
w_vot_08 = D.loc[mask, 'pop2008'].values
if w_vot_08.sum() > 0:
w_vot_08 = w_vot_08 / w_vot_08.sum()
D.loc[mask, 'vote2008_adj'] = correct_weighted(
D.loc[mask, 'vote2008_M'].values,
w_vot_08,
stt_row['rep2008']
)
print("MRP Replication complete.")
Preprocessing complete. ***** Multilevel Models for 2004 ***** CPS Turnout Model Building Turnout Model... Fitting Turnout Model...
Output()
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul output_storage[0][0] = np.matmul(*inputs)
Pathfinder Results
No. model parameters 941
Configuration:
num_draws_per_path 1000
history size (maxcor) 21
max iterations 1000
ftol 1.00e-05
gtol 1.00e-08
max line search 1000
jitter 2.0
epsilon 1.00e-08
ELBO draws 10
LBFGS Status:
CONVERGED 7
INIT_FAILED 3
L-BFGS iterations mean 312 ± std 48
Path Status:
SUCCESS 7
LBFGS_FAILED 3
ELBO argmax mean 100 ± std 37
Importance Sampling:
Method psis
Pareto k 10.50
Timing (seconds):
Compile 5.95
Compute 98.46
Total 104.41
Warnings:
- INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this
occurence is high relative to the number of paths.
Output()
Predicting Turnout... Turnout Prediction complete. ***** Annenberg/Pew Vote Choice Model Building Vote Choice Model... Fitting Vote Choice Model...
Output()
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul output_storage[0][0] = np.matmul(*inputs)
Pathfinder Results
No. model parameters 941
Configuration:
num_draws_per_path 1000
history size (maxcor) 21
max iterations 1000
ftol 1.00e-05
gtol 1.00e-08
max line search 1000
jitter 2.0
epsilon 1.00e-08
ELBO draws 10
LBFGS Status:
CONVERGED 7
INIT_FAILED 3
L-BFGS iterations mean 329 ± std 51
Path Status:
SUCCESS 7
LBFGS_FAILED 3
ELBO argmax mean 53 ± std 15
Importance Sampling:
Method psis
Pareto k 12.01
Timing (seconds):
Compile 4.20
Compute 108.80
Total 113.00
Warnings:
- INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this
occurence is high relative to the number of paths.
Output()
Predicting Vote Choice... Vote Choice Prediction complete. ***** Multilevel Models for 2008 ***** CPS Turnout Model Building Turnout Model... Fitting Turnout Model...
Output()
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul output_storage[0][0] = np.matmul(*inputs)
Pathfinder Results
No. model parameters 941
Configuration:
num_draws_per_path 1000
history size (maxcor) 21
max iterations 1000
ftol 1.00e-05
gtol 1.00e-08
max line search 1000
jitter 2.0
epsilon 1.00e-08
ELBO draws 10
LBFGS Status:
CONVERGED 7
INIT_FAILED 3
L-BFGS iterations mean 284 ± std 43
Path Status:
SUCCESS 7
LBFGS_FAILED 3
ELBO argmax mean 79 ± std 22
Importance Sampling:
Method psis
Pareto k 9.31
Timing (seconds):
Compile 2.81
Compute 81.21
Total 84.02
Warnings:
- INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this
occurence is high relative to the number of paths.
Output()
Predicting Turnout... Turnout Prediction complete. ***** Annenberg/Pew Vote Choice Model Building Vote Choice Model... Fitting Vote Choice Model...
Output()
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul output_storage[0][0] = np.matmul(*inputs)
Pathfinder Results
No. model parameters 941
Configuration:
num_draws_per_path 1000
history size (maxcor) 21
max iterations 1000
ftol 1.00e-05
gtol 1.00e-08
max line search 1000
jitter 2.0
epsilon 1.00e-08
ELBO draws 10
LBFGS Status:
CONVERGED 8
INIT_FAILED 2
L-BFGS iterations mean 294 ± std 46
Path Status:
SUCCESS 8
LBFGS_FAILED 2
ELBO argmax mean 39 ± std 19
Importance Sampling:
Method psis
Pareto k 11.80
Timing (seconds):
Compile 2.94
Compute 94.99
Total 97.94
Warnings:
- INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this
occurence is high relative to the number of paths.
Output()
Predicting Vote Choice... Vote Choice Prediction complete. Starting Poststratification... Starting Poststratification... Starting Calibration... MRP Replication complete.
In [4]:
def color_alpha(colors, alpha=1.0):
# colors can be a list of hex or names
rgba = mcolors.to_rgba_array(colors)
rgba[:, 3] = alpha
return rgba
def color_groups(x, x_seq, color1, color2, color3, alpha=1.0):
# x is a series or array
x = np.array(x)
x = np.round(x)
x = np.clip(x, x_seq[0], x_seq[1])
# Normalize to 0-1 for colormap
norm = mcolors.Normalize(vmin=x_seq[0], vmax=x_seq[1])
cmap = mcolors.LinearSegmentedColormap.from_list("custom", [color1, color2, color3])
return cmap(norm(x), alpha=alpha)
def plot_bubble(x, y, sz, bg="white", fg="black", title="", xlab="", ylab="", xlim=(0,1), ylim=(0,1), ax=None):
if ax is None:
fig, ax = plt.subplots()
# Sort by size descending
idx = np.argsort(sz)[::-1]
x = np.array(x)[idx]
y = np.array(y)[idx]
sz = np.array(sz)[idx]
if isinstance(bg, (list, np.ndarray)):
if len(bg) == len(x): bg = np.array(bg)[idx]
if isinstance(fg, (list, np.ndarray)):
if len(fg) == len(x): fg = np.array(fg)[idx]
ax.scatter(x, y, s=sz*15000, c=bg, edgecolors=fg, linewidth=0.5)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
ax.set_title(title)
ax.set_xlabel(xlab)
ax.set_ylabel(ylab)
# Add grid lines
ax.axvline(np.mean(xlim), color='grey', linestyle=':', linewidth=0.5)
ax.axhline(np.mean(ylim), color='grey', linestyle=':', linewidth=0.5)
return ax
_us_states_gdf = None
def get_us_map():
global _us_states_gdf
if _us_states_gdf is None:
# Use a reliable GeoJSON source for US states
url = "https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_500k.json"
try:
gdf = gpd.read_file(url)
_us_states_gdf = gdf
except Exception as e:
print(f"Error loading map: {e}")
return None
return _us_states_gdf.copy()
def plot_map(ax, colors, title=""):
# colors: array-like of colors (strings or RGBA), aligned with state_name (0-50)
gdf = get_us_map()
if gdf is None:
ax.text(0.5, 0.5, "Map Error", ha='center')
return
# Map colors to gdf
# Create a dict mapping NAME to color
name_to_color = {}
for idx, name in enumerate(state_name):
if idx < len(colors):
name_to_color[name] = colors[idx]
gdf['color'] = gdf['NAME'].map(name_to_color).fillna('lightgrey')
# Reproject to Albers Equal Area for better shape
try:
gdf = gdf.to_crs("EPSG:5070")
except:
pass
# Separate AK, HI, and Lower 48
ak = gdf[gdf['NAME'] == 'Alaska']
hi = gdf[gdf['NAME'] == 'Hawaii']
lower48 = gdf[~gdf['NAME'].isin(['Alaska', 'Hawaii', 'Puerto Rico'])]
# Plot Lower 48
lower48.plot(ax=ax, color=lower48['color'], edgecolor='white', linewidth=0.5)
# Inset for Alaska
if not ak.empty:
ax_ak = ax.inset_axes([0.0, 0.0, 0.25, 0.25]) # Bottom left
ak.plot(ax=ax_ak, color=ak['color'], edgecolor='white', linewidth=0.5)
ax_ak.axis('off')
# Zoom to AK
minx, miny, maxx, maxy = ak.total_bounds
ax_ak.set_xlim(minx, maxx)
ax_ak.set_ylim(miny, maxy)
# Inset for Hawaii
if not hi.empty:
ax_hi = ax.inset_axes([0.25, 0.0, 0.15, 0.15]) # Next to AK
hi.plot(ax=ax_hi, color=hi['color'], edgecolor='white', linewidth=0.5)
ax_hi.axis('off')
# Zoom to HI
minx, miny, maxx, maxy = hi.total_bounds
ax_hi.set_xlim(minx, maxx)
ax_hi.set_ylim(miny, maxy)
ax.set_title(title, fontsize=10)
ax.axis('off')
In [5]:
print("Generating Figure 1...")
# Subset data
mask_fig1 = (dat_vot['year'] == 2008) & (dat_vot['eth'] == 1)
df_fig1 = dat_vot[mask_fig1].copy()
df_fig1['stt_idx'] = df_fig1['stt'] - 1
df_fig1['trials_int'] = 1
df_fig1['successes_int'] = df_fig1['rvote'].astype(int)
# M1: rvote ~ inc + (1 | stt)
config_m1 = {
'fixed_effects': ['inc'],
'random_effects': {
'stt': {'idx_col': 'stt_idx', 'n_cats': n_stt, 'slope_col': None}
}
}
coords_fig1 = {"obs_id": df_fig1.index}
builder_m1 = MRPModelBuilder(df_fig1, coords_fig1, config_m1)
builder_m1.build_model()
builder_m1.fit(method="mcmc", draws=50, tune=50, chains=1, progressbar=False)
# M2: rvote ~ inc + (1 + inc | stt)
config_m2 = {
'fixed_effects': ['inc'],
'random_effects': {
'stt': {'idx_col': 'stt_idx', 'n_cats': n_stt, 'slope_col': 'inc'}
}
}
builder_m2 = MRPModelBuilder(df_fig1, coords_fig1, config_m2)
builder_m2.build_model()
builder_m2.fit(method="mcmc", draws=50, tune=50, chains=1, progressbar=False)
# Predictions
dat_pc = pd.DataFrame(list(itertools.product(range(n_stt), range(1, 6))), columns=['stt_idx', 'inc'])
dat_pc['stt'] = dat_pc['stt_idx'] + 1
dat_pc['M1'] = builder_m1.predict(dat_pc)
dat_pc['M2'] = builder_m2.predict(dat_pc)
def get_raw_mean(row):
mask = (df_fig1['stt_idx'] == row['stt_idx']) & (df_fig1['inc'] == row['inc'])
if mask.sum() == 0: return np.nan
return np.average(df_fig1.loc[mask, 'rvote'], weights=df_fig1.loc[mask, 'weight'])
dat_pc['raw'] = dat_pc.apply(get_raw_mean, axis=1)
# Plotting
fig1, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
titles = ["Raw Values", "Income coefficient\nconsistent across states", "Income coefficient\nvarying by state"]
data_cols = ['raw', 'M1', 'M2']
rep2008 = dat_stt['rep2008'].values
stt_colors = color_groups(rep2008 * 100, [25, 75], "darkblue", "grey", "darkred")
bold_states = ["MS", "OH", "CT"]
bold_indices = [state_abb.index(s) for s in bold_states]
for i, ax in enumerate(axes):
ax.set_title(titles[i])
ax.set_xticks(range(1, 6))
ax.set_xticklabels(inc_label, rotation=90)
ax.set_ylim(0, 1)
for stt_idx in range(n_stt):
if stt_idx in bold_indices: continue
subset = dat_pc[dat_pc['stt_idx'] == stt_idx]
ax.plot(subset['inc'], subset[data_cols[i]], color=stt_colors[stt_idx], linewidth=0.5, alpha=0.7)
for stt_idx in range(n_stt):
if stt_idx not in bold_indices: continue
subset = dat_pc[dat_pc['stt_idx'] == stt_idx]
ax.plot(subset['inc'], subset[data_cols[i]], color=stt_colors[stt_idx], linewidth=2)
ax.text(0.9, subset[subset['inc']==1][data_cols[i]].values[0], state_abb[stt_idx], ha='right', fontsize=8)
ax.text(5.1, subset[subset['inc']==5][data_cols[i]].values[0], state_abb[stt_idx], ha='left', fontsize=8)
plt.tight_layout()
plt.savefig("figs/1.png")
print("Figure 1 saved.")
Only 50 samples per chain. Reliable r-hat and ESS diagnostics require longer chains for accurate estimate.
Generating Figure 1...
Initializing NUTS using jitter+adapt_diag... Sequential sampling (1 chains in 1 job) NUTS: [b_0, b_inc, stt::sigma, stt::raw] Sampling 1 chain for 50 tune and 50 draw iterations (50 + 50 draws total) took 13 seconds. The number of samples is too small to check convergence reliably. Only 50 samples per chain. Reliable r-hat and ESS diagnostics require longer chains for accurate estimate. Initializing NUTS using jitter+adapt_diag... Sequential sampling (1 chains in 1 job) NUTS: [b_0, b_inc, stt::sigma, stt::raw, stt::sigma_slope, stt::raw_slope] Sampling 1 chain for 50 tune and 50 draw iterations (50 + 50 draws total) took 27 seconds. The number of samples is too small to check convergence reliably.
Figure 1 saved.
In [6]:
print("Generating Figure 2...")
# Aggregations
D['plot_grp'] = D.apply(lambda row: f"{row['stt']}_{row['inc']}", axis=1)
D2 = D.groupby('plot_grp').apply(
lambda x: pd.Series({
'stt': x['stt'].iloc[0],
'inc': x['inc'].iloc[0],
'vote2008': np.sum(x['vote2008_adj'] * x['pop2008']) / np.sum(x['pop2008'])
}), include_groups=False
).reset_index()
D3 = D[D['eth'] == 0].groupby('plot_grp').apply( # eth 0 is White
lambda x: pd.Series({
'stt': x['stt'].iloc[0],
'inc': x['inc'].iloc[0],
'vote2008': np.sum(x['vote2008_adj'] * x['pop2008']) / np.sum(x['pop2008'])
}), include_groups=False
).reset_index()
stt_inc_grid = pd.DataFrame(list(itertools.product(range(n_stt), range(n_inc))), columns=['stt', 'inc'])
stt_inc_grid['fit1'] = np.nan
stt_inc_grid['se1'] = np.nan
stt_inc_grid['mu1'] = np.nan
stt_inc_grid['fit2'] = np.nan
stt_inc_grid['se2'] = np.nan
stt_inc_grid['mu2'] = np.nan
# We need to compute weighted means and SEs from dat_vot
# dat_vot uses 1-based stt and inc.
# stt_inc_grid uses 0-based.
for i, row in stt_inc_grid.iterrows():
s, inc = int(row['stt']), int(row['inc'])
# All voters
mask = (dat_vot['stt'] == s + 1) & (dat_vot['inc'] == inc + 1) & (dat_vot['year'] == 2008)
if mask.sum() > 1:
# Model fit
fit_val = D2.loc[(D2['stt'] == s) & (D2['inc'] == inc), 'vote2008']
if not fit_val.empty:
stt_inc_grid.at[i, 'fit1'] = fit_val.values[0]
# Raw data
w = dat_vot.loc[mask, 'weight']
v = dat_vot.loc[mask, 'rvote']
mu = np.average(v, weights=w)
mu = min(0.98, max(0.02, mu))
se = np.sqrt((mu * (1 - mu)) / mask.sum())
stt_inc_grid.at[i, 'mu1'] = mu
stt_inc_grid.at[i, 'se1'] = se
# White voters
mask_w = mask & (dat_vot['eth'] == 1)
if mask_w.sum() > 1:
fit_val = D3.loc[(D3['stt'] == s) & (D3['inc'] == inc), 'vote2008']
if not fit_val.empty:
stt_inc_grid.at[i, 'fit2'] = fit_val.values[0]
w = dat_vot.loc[mask_w, 'weight']
v = dat_vot.loc[mask_w, 'rvote']
mu = np.average(v, weights=w)
mu = min(0.98, max(0.02, mu))
se = np.sqrt((mu * (1 - mu)) / mask_w.sum())
stt_inc_grid.at[i, 'mu2'] = mu
stt_inc_grid.at[i, 'se2'] = se
# Plotting Fig 2
fig2 = plt.figure(figsize=(10, 10))
sort_state = np.argsort(dat_stt['rep2008'].values[:n_stt])[::-1] # Descending
count = 0
rows, cols = 7, 7
for idx in sort_state:
if state_abb[idx] in ["AK", "HI", "DC"]: continue
count += 1
ax = fig2.add_subplot(rows, cols, count)
# Axis labels only on edges
if count % cols == 1:
ax.set_yticks([0.02, 0.5, 0.95])
ax.set_yticklabels(["0%", "50%", "100%"], fontsize=8)
else:
ax.set_yticks([])
if count > (48 - cols):
ax.set_xticks([0, 2, 4])
ax.set_xticklabels(["poor", "mid", "rich"], fontsize=8)
else:
ax.set_xticks([])
ax.axhline(0.5, color='gray', linewidth=0.5)
ax.set_ylim(0, 1)
ax.set_xlim(-0.5, 4.5)
tmp = stt_inc_grid[stt_inc_grid['stt'] == idx]
# All voters (gray)
ax.errorbar(tmp['inc'], tmp['mu1'], yerr=tmp['se1'], fmt='o', color='gray', markersize=3, elinewidth=1)
ax.plot(tmp['inc'], tmp['fit1'], color='gray', linewidth=1)
# White voters (orange)
ax.errorbar(tmp['inc'] + 0.1, tmp['mu2'], yerr=tmp['se2'], fmt='o', color='darkorange', markersize=3, elinewidth=1)
ax.plot(tmp['inc'], tmp['fit2'], color='darkorange', linewidth=1)
ax.text(2, 0.9, state_name[idx], ha='center', fontsize=9)
plt.tight_layout()
plt.savefig("figs/2.png")
print("Figure 2 saved.")
Generating Figure 2... Figure 2 saved.
In [7]:
print("Generating Figure 3...")
fig3, axes = plt.subplots(1, 3, figsize=(18, 6))
# 1. State x Eth
D['grp_plot'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}", axis=1)
D_plot = D.groupby('grp_plot').apply(
lambda x: pd.Series({
'eth': x['eth'].iloc[0],
'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
'pop2008': np.sum(x['pop2008'])
}), include_groups=False
).reset_index()
eth_colors = ['white', 'black', 'red', 'green']
bg_colors = [eth_colors[int(e)] for e in D_plot['eth']]
fg_colors = ['grey'] * len(D_plot)
plot_bubble(D_plot['vote2008'], D_plot['turn2008'], np.sqrt(D_plot['pop2008'])/15000,
bg=bg_colors, fg=fg_colors, title="State x Ethnicity",
xlab="McCain Vote 2008", ylab="Turnout 2008", ax=axes[0])
# 2. State x Eth x Inc
D['grp_plot'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['inc']}", axis=1)
D_plot = D.groupby('grp_plot').apply(
lambda x: pd.Series({
'eth': x['eth'].iloc[0],
'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
'pop2008': np.sum(x['pop2008'])
}), include_groups=False
).reset_index()
bg_colors = [eth_colors[int(e)] for e in D_plot['eth']]
fg_colors = ['grey'] * len(D_plot)
plot_bubble(D_plot['vote2008'], D_plot['turn2008'], np.sqrt(D_plot['pop2008'])/15000,
bg=bg_colors, fg=fg_colors, title="State x Ethnicity x Income",
xlab="McCain Vote 2008", ylab="", ax=axes[1])
# 3. State x Eth x Inc x Age
D['grp_plot'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['inc']}_{row['age']}", axis=1)
D_plot = D.groupby('grp_plot').apply(
lambda x: pd.Series({
'eth': x['eth'].iloc[0],
'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
'pop2008': np.sum(x['pop2008'])
}), include_groups=False
).reset_index()
bg_colors = [eth_colors[int(e)] for e in D_plot['eth']]
fg_colors = ['grey'] * len(D_plot)
plot_bubble(D_plot['vote2008'], D_plot['turn2008'], np.sqrt(D_plot['pop2008'])/15000,
bg=bg_colors, fg=fg_colors, title="State x Ethnicity x Income x Age",
xlab="McCain Vote 2008", ylab="", ax=axes[2])
plt.tight_layout()
plt.savefig("figs/3.png")
print("Figure 3 saved.")
Generating Figure 3...
/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:29: RuntimeWarning: invalid value encountered in scalar divide 'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']), /var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:30: RuntimeWarning: invalid value encountered in scalar divide 'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']), /var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:46: RuntimeWarning: invalid value encountered in scalar divide 'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']), /var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:47: RuntimeWarning: invalid value encountered in scalar divide 'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
Figure 3 saved.
In [8]:
print("Generating Figure 4...")
# Prepare data
ok_grp = np.zeros(len(D), dtype=bool)
for i in range(n_stt):
mask = D['stt'] == i
tmp = D.loc[mask, 'pop2008'] * D.loc[mask, 'turn2008_adj']
threshold = tmp.sum() / 100
ok_grp[mask] = (tmp >= threshold)
fig4, axes = plt.subplots(n_inc, n_age, figsize=(15, 10))
fig4.suptitle("McCain 2008 minus Bush 2004 among whites", fontsize=16)
for i_age in range(n_age):
for i_inc in range(n_inc):
ax = axes[i_inc, i_age]
# Filter
mask = (D['age'] == i_age) & (D['inc'] == i_inc) & (D['eth'] == 0) # eth 0 is White
subset = D[mask].copy()
# Ensure we have data for all states in order
# subset might be missing states if they were filtered out earlier, but D is a grid.
# However, D was created with itertools, so it should be complete.
# We sort by stt to be sure.
subset = subset.sort_values('stt')
if len(subset) != n_stt:
# Should not happen given D construction, but safety check
subset = subset.set_index('stt').reindex(range(n_stt))
# Calculate values
vals = (subset['vote2008_adj'] - subset['vote2004_adj']) * 100
# Get colors
colors = color_groups(vals, [-25, 25], "darkblue", "white", "darkred", alpha=0.75)
# Apply mask for small groups
# ok_grp is aligned with D. We need it for this subset.
# Since subset is sorted by stt and filtered by age/inc/eth,
# we need to extract corresponding ok_grp values.
# We can use the index of subset (which comes from D) to index ok_grp.
subset_indices = subset.index
grp_mask = ok_grp[subset_indices]
# Set alpha to 0 for masked
colors[~grp_mask] = [0, 0, 0, 0]
plot_map(ax, colors, title=f"Age {age_label[i_age]}\nInc {inc_label[i_inc]}")
plt.tight_layout()
plt.savefig("figs/4.png")
print("Figure 4 saved.")
Generating Figure 4... Figure 4 saved.
In [9]:
print("Generating Figure 5...")
agg_mean = np.sum(D['pop2008'] * D['turn2008_adj']) / np.sum(D['pop2008']) - \
np.sum(D['pop2004'] * D['turn2004_adj']) / np.sum(D['pop2004'])
fig5, axes = plt.subplots(n_age, n_eth, figsize=(12, 12), sharex=True, sharey=True)
fig5.suptitle("Turnout Swing (08-04)", fontsize=16)
for i_eth in range(n_eth):
for i_age in range(n_age):
ax = axes[i_age, i_eth]
mask = (D['eth'] == i_eth) & (D['age'] == i_age)
subset = D[mask]
# Aggregate by income
res = []
for i_inc in range(n_inc):
mask2 = mask & (D['inc'] == i_inc)
sub2 = D[mask2]
p04 = sub2['pop2004'].sum()
t04 = np.sum(sub2['pop2004'] * sub2['turn2004_adj']) / p04 if p04 > 0 else 0
p08 = sub2['pop2008'].sum()
t08 = np.sum(sub2['pop2008'] * sub2['turn2008_adj']) / p08 if p08 > 0 else 0
res.append([p04, t04, p08, t08])
res = np.array(res)
res[:, 0] = res[:, 0] / res[:, 0].sum() / 2 # pop04
res[:, 2] = res[:, 2] / res[:, 2].sum() / 2 # pop08
grp_mean = np.sum(subset['pop2008'] * subset['turn2008_adj']) / np.sum(subset['pop2008']) - \
np.sum(subset['pop2004'] * subset['turn2004_adj']) / np.sum(subset['pop2004'])
col1 = "lightblue" if grp_mean > 0.05 else "lightgrey"
col2 = "darkblue" if grp_mean > 0.05 else "black"
x = np.arange(n_inc)
ax.bar(x, res[:, 2], color=col1, width=0.8)
swing = res[:, 3] - res[:, 1]
ax.plot(x, swing, 'o-', color=col2, markersize=4)
ax.axhline(0, linestyle=':', color='black')
ax.axhline(agg_mean, linestyle=':', color='red')
ax.set_ylim(-0.05, 0.2)
if i_age == 0:
ax.set_title(eth_label[i_eth])
if i_eth == 0:
ax.set_ylabel(age_label[i_age])
if i_age == 3:
ax.set_xticks(x)
ax.set_xticklabels(inc_label, rotation=90)
plt.tight_layout()
plt.savefig("figs/5.png")
print("Figure 5 saved.")
Generating Figure 5... Figure 5 saved.
In [10]:
print("Generating Figure 6...")
D['plot_grp'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['age']}", axis=1)
D3 = D.groupby('plot_grp').apply(
lambda x: pd.Series({
'stt': x['stt'].iloc[0],
'eth': x['eth'].iloc[0],
'age': x['age'].iloc[0],
'pop2008': np.sum(x['pop2008']),
'turn2008': np.sum(x['pop2008'] * x['turn2008_adj']) / np.sum(x['pop2008'])
}), include_groups=False
).reset_index()
ok_grp = np.zeros(len(D3), dtype=bool)
for i in range(n_stt):
mask = D3['stt'] == i
tmp = D3.loc[mask, 'pop2008']
threshold = tmp.sum() / 100
ok_grp[mask] = (tmp >= threshold)
from matplotlib.colors import LinearSegmentedColormap
fig6, axes = plt.subplots(n_age, n_eth, figsize=(15, 12))
fig6.suptitle("Turnout among Young Whites and Minorities Over-Emphasized", fontsize=16)
dark_red = [0.5, 0.0, 0.0]
white = [1.0, 1.0, 1.0]
dark_green = [0.0, 0.5, 0.0]
color_nodes = [
(0.0, dark_red), # Start at 0.0 (vmin) with dark_red
(0.6, white), # Pass through white at the 0.6 point
(1.0, dark_green) # End at 1.0 (vmax) with dark_green
]
custom_cmap = LinearSegmentedColormap.from_list("RedWhiteGreen", color_nodes)
for i_age in range(n_age):
for i_eth in range(n_eth):
ax = axes[i_age, i_eth]
mask = (D3['age'] == i_age) & (D3['eth'] == i_eth)
subset = D3[mask].copy()
subset = subset.sort_values('stt')
if len(subset) != n_stt:
subset = subset.set_index('stt').reindex(range(n_stt))
vals = subset['turn2008']
colors = color_groups(vals, [0.4, 0.8], "darkred", "white", "green", alpha=0.75)
# Apply mask
subset_indices = subset.index
grp_mask = ok_grp[subset_indices]
colors[~grp_mask] = [0, 0, 0, 0]
rplot = plot_map(ax, colors, title="")
ax.axis("on")
ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
for spine in ax.spines.values():
spine.set_visible(False)
if i_eth == 0:
ax.set_ylabel(age_label[i_age], loc="center", fontsize=12)
# Apply X-label only to the last row (i_age == n_age - 1)
if i_age == 0:
ax.set_title(eth_label[i_eth], y=1.0, fontsize=12)
# ax.set_xlabel(eth_label[i_eth], loc="center")
# ax.xaxis.set_label_position('top')
norm = mpl.colors.Normalize(vmin=np.min(colors[colors != 0]), vmax=np.max(colors))
sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm)
sm.set_array([])
cbar = fig6.colorbar(sm, ax=ax)
cbar.set_label('Turnout', rotation=270, labelpad=15)
plt.tight_layout()
plt.savefig("figs/6.png")
print("Figure 6 saved.")
Generating Figure 6...
/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/1415949281.py:10: RuntimeWarning: invalid value encountered in scalar divide 'turn2008': np.sum(x['pop2008'] * x['turn2008_adj']) / np.sum(x['pop2008'])
Figure 6 saved.
In [11]:
year = 2008
z_incstt_vals = L_z_incstt_map[year]
z_repprv_vals = L_z_repprv_map[year]
z_trnprv_vals = L_z_trnprv_map[year]
D['z_incstt'] = D['stt'].apply(lambda x: z_incstt_vals[x])
D['z_repprv'] = D['stt'].apply(lambda x: z_repprv_vals[x])
D['z_trnprv'] = D['stt'].apply(lambda x: z_trnprv_vals[x])
# --- Turnout Model ---
print("***** CPS Turnout Model", flush=True)
data_cps = get_model_data(year, dat_cps, D, 'vote')
# Filter observed
observed_mask = data_cps['trials_int'] > 0
df_obs = data_cps[observed_mask].copy()
# Add interaction columns to df_obs (they are in D, merged in get_model_data)
# get_model_data merges D_grid (which has interactions) with stats.
# So df_obs has them.
# Config
if run_full_models:
turnout_config = {
'fixed_effects': ['z_inc', 'z_incstt', 'z_trnprv'],
'fixed_interactions': [('z_inc', 'z_incstt'), ('z_inc', 'z_trnprv')],
'random_effects': {
'reg': {'idx_col': 'reg', 'n_cats': n_reg, 'slope_col': 'z_inc'},
'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': 'z_inc'},
'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': 'z_inc'},
'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': 'z_inc'},
}
}
# Add interactions to random effects
for cols in interaction_cols:
name = "_".join(cols)
n_cats = int(D[name].max() + 1)
turnout_config['random_effects'][name] = {'idx_col': name, 'n_cats': n_cats, 'slope_col': None}
else:
turnout_config = {
'fixed_effects': [],
'random_effects': {
'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': None},
'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': None},
'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': None},
}
}
coords = {"obs_id": df_obs.index}
builder = MRPModelBuilder(df_obs, coords, turnout_config)
print("Building Turnout Model...", flush=True)
builder.build_model()
print("Fitting Turnout Model...", flush=True)
builder.fit(method="pathfinder", num_paths=10, num_draws=200, random_seed=42)
***** CPS Turnout Model Building Turnout Model... Fitting Turnout Model...
Output()
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul output_storage[0][0] = np.matmul(*inputs)
/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul output_storage[0][0] = np.matmul(*inputs)
Pathfinder Results
No. model parameters 941
Configuration:
num_draws_per_path 1000
history size (maxcor) 21
max iterations 1000
ftol 1.00e-05
gtol 1.00e-08
max line search 1000
jitter 2.0
epsilon 1.00e-08
ELBO draws 10
LBFGS Status:
CONVERGED 7
INIT_FAILED 3
L-BFGS iterations mean 284 ± std 43
Path Status:
SUCCESS 7
LBFGS_FAILED 3
ELBO argmax mean 79 ± std 22
Importance Sampling:
Method psis
Pareto k 9.31
Timing (seconds):
Compile 2.88
Compute 83.67
Total 86.55
Warnings:
- INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this
occurence is high relative to the number of paths.
Output()
Out[11]:
arviz.InferenceData
-
<xarray.Dataset> Size: 3MB Dimensions: (chain: 1, draw: 200, age::raw_dim_0: 4, age::raw_slope_dim_0: 4, age::re_dim_0: 4, age::re_slope_dim_0: 4, eth::raw_dim_0: 4, eth::raw_slope_dim_0: 4, eth::re_dim_0: 4, eth::re_slope_dim_0: 4, eth_age::raw_dim_0: 16, eth_age::re_dim_0: 16, eth_inc::raw_dim_0: 20, ... reg_inc::re_dim_0: 25, stt::raw_dim_0: 51, stt::raw_slope_dim_0: 51, stt::re_dim_0: 51, stt::re_slope_dim_0: 51, stt_age::raw_dim_0: 204, stt_age::re_dim_0: 204, stt_eth::raw_dim_0: 204, stt_eth::re_dim_0: 204, stt_inc::raw_dim_0: 255, stt_inc::re_dim_0: 255) Coordinates: (12/38) * chain (chain) int64 8B 0 * draw (draw) int64 2kB 0 1 2 3 4 5 ... 195 196 197 198 199 * age::raw_dim_0 (age::raw_dim_0) int64 32B 0 1 2 3 * age::raw_slope_dim_0 (age::raw_slope_dim_0) int64 32B 0 1 2 3 * age::re_dim_0 (age::re_dim_0) int64 32B 0 1 2 3 * age::re_slope_dim_0 (age::re_slope_dim_0) int64 32B 0 1 2 3 ... ... * stt_age::raw_dim_0 (stt_age::raw_dim_0) int64 2kB 0 1 2 3 ... 201 202 203 * stt_age::re_dim_0 (stt_age::re_dim_0) int64 2kB 0 1 2 3 ... 201 202 203 * stt_eth::raw_dim_0 (stt_eth::raw_dim_0) int64 2kB 0 1 2 3 ... 201 202 203 * stt_eth::re_dim_0 (stt_eth::re_dim_0) int64 2kB 0 1 2 3 ... 201 202 203 * stt_inc::raw_dim_0 (stt_inc::raw_dim_0) int64 2kB 0 1 2 3 ... 252 253 254 * stt_inc::re_dim_0 (stt_inc::re_dim_0) int64 2kB 0 1 2 3 ... 252 253 254 Data variables: (12/60) age::raw (chain, draw, age::raw_dim_0) float64 6kB -0.8859 .... age::raw_slope (chain, draw, age::raw_slope_dim_0) float64 6kB -1.... age::re (chain, draw, age::re_dim_0) float64 6kB -0.13 ... ... age::re_slope (chain, draw, age::re_slope_dim_0) float64 6kB -0.1... age::sigma (chain, draw) float64 2kB 0.1467 0.1487 ... 0.1354 age::sigma_slope (chain, draw) float64 2kB 0.1318 0.1466 ... 0.1118 ... ... stt_eth::raw (chain, draw, stt_eth::raw_dim_0) float64 326kB -0.... stt_eth::re (chain, draw, stt_eth::re_dim_0) float64 326kB -0.0... stt_eth::sigma (chain, draw) float64 2kB 0.01946 0.02484 ... 0.01923 stt_inc::raw (chain, draw, stt_inc::raw_dim_0) float64 408kB 0.0... stt_inc::re (chain, draw, stt_inc::re_dim_0) float64 408kB 0.00... stt_inc::sigma (chain, draw) float64 2kB 0.0077 0.01054 ... 0.004776 Attributes: created_at: 2025-12-05T12:59:15.097280+00:00 arviz_version: 0.22.0 -
<xarray.Dataset> Size: 50kB Dimensions: (obs_id: 3127) Coordinates: * obs_id (obs_id) int64 25kB 0 1 2 3 4 5 6 ... 4070 4071 4073 4074 4077 4078 Data variables: y_obs (obs_id) int64 25kB 4 6 15 23 17 12 19 38 10 ... 1 2 1 6 1 1 3 1 1 Attributes: created_at: 2025-12-05T12:59:15.167498+00:00 arviz_version: 0.22.0 inference_library: pymc inference_library_version: 5.26.1 -
<xarray.Dataset> Size: 100kB Dimensions: (obs_id: 3127) Coordinates: * obs_id (obs_id) int64 25kB 0 1 2 3 4 5 ... 4070 4071 4073 4074 4077 4078 Data variables: z_inc (obs_id) float64 25kB -0.707 -0.707 -0.707 ... 0.3535 0.707 0.707 z_incstt (obs_id) float64 25kB -0.6491 -0.6491 -0.6491 ... -0.1106 -0.1106 z_trnprv (obs_id) float64 25kB -0.4547 -0.4547 -0.4547 ... 0.1959 0.1959 Attributes: created_at: 2025-12-05T12:59:15.168989+00:00 arviz_version: 0.22.0 inference_library: pymc inference_library_version: 5.26.1 -
<xarray.Dataset> Size: 85kB Dimensions: (status: 12, path: 7, param: 941) Coordinates: * status (status) <U26 1kB 'CONVERGED' ... 'SUCCESS' * path (path) int64 56B 0 1 2 3 4 5 6 * param (param) <U22 83kB 'b_0' ... 'inc_age::raw[19]' Data variables: (12/35) compile_time float64 8B 2.876 compute_time float64 8B 83.67 total_time float64 8B 86.55 importance_sampling_method <U4 16B 'psis' pareto_k float64 8B 9.31 lbfgs_status_counts (status) float64 96B 7.0 nan 3.0 ... 0.0 nan nan ... ... config/gtol float64 8B 1e-08 config/maxls int64 8B 1000 config/jitter float64 8B 2.0 config/epsilon float64 8B 1e-08 config/num_elbo_draws int64 8B 10 config/num_retries int64 8B 0 Attributes: warnings: ['INIT_FAILED: LBFGS failed to initialize. Consider reparamete...
In [ ]: