In [1]:

import pandas as pd
import numpy as np
import os
import itertools
import gc
from scipy.optimize import brentq
from transforms import logit, invlogit, rescale
from model_builder import MRPModelBuilder

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import geopandas as gpd
from shapely.geometry import box
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 700

if 'mrp-replication' in os.listdir('.'):
    os.chdir('mrp-replication')

run_full_models = True

########################################################################################
### labels
########################################################################################

reg_lkup = [3,4,4,3,4,4,1,1,5,3,3,4,4,2,2,2,2,3,3,1,1,1,2,2,3,2,4,2,4,1,1,4,1,3,2,2,3,4,1,1,3,2,3,3,4,1,3,4,1,2,4]
reg_label = ["Northeast", "Midwest", "South", "West", "DC"]
eth_label = ["White", "Black", "Hispanic", "Other"]
inc_label = ["$0-20k", "$20-40k", "$40-75k", "$75-150k", "$150k+"]
age_label = ["18-29", "30-44", "45-64", "65+"]

n_reg = len(reg_label)
n_stt = 51 # 50 states + DC
n_eth = len(eth_label)
n_inc = len(inc_label)
n_age = len(age_label)

state_abb = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]
state_name = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia",
    "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
    "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming"
]
stt_label = state_abb

########################################################################################
### get data
########################################################################################

### state-level data
dat_stt = pd.read_csv("data/state-stats.dat", sep="\t")
dat_stt['z.inc2000'] = rescale(dat_stt['inc2000'])
dat_stt['z.inc2004'] = rescale(dat_stt['inc2004'])
dat_stt['z.inc2007'] = rescale(dat_stt['inc2007'])
dat_stt['z.rep1996'] = rescale(dat_stt['rep1996'])
dat_stt['z.rep2000'] = rescale(dat_stt['rep2000'])
dat_stt['z.rep2004'] = rescale(dat_stt['rep2004'])
dat_stt['z.rep2008'] = rescale(dat_stt['rep2008'])
dat_stt['z.trn1996'] = rescale(dat_stt['vote1996']/dat_stt['pop1996'])
dat_stt['z.trn2000'] = rescale(dat_stt['vote2000']/dat_stt['pop2000'])
dat_stt['z.trn2004'] = rescale(dat_stt['vote2004']/dat_stt['pop2004'])
dat_stt['z.trn2008'] = rescale(dat_stt['vote2008']/dat_stt['pop2007'])
dat_stt['stt'] = np.arange(1, len(dat_stt) + 1) # 1-based index

### CPS data for modeling turnout
dat_cps = pd.read_csv("data/cps2000-04-08-DKs.dat", sep="\t")
cols_to_check = dat_cps.columns[:8]
ok = (dat_cps[cols_to_check].isna().sum(axis=1) == 0) & ((dat_cps['cit'] == 1) | dat_cps['cit'].isna())
dat_cps = dat_cps[ok].copy()
dat_cps['reg'] = dat_cps['stt'].apply(lambda x: reg_lkup[x-1])
year_map = {'2000-cps': 2000, '2004-cps': 2004, '2008-cps': 2008}
dat_cps['year'] = dat_cps['file'].map(year_map)
dat_cps = dat_cps[dat_cps['year'] != 2000].copy()

### annenberg / pew data for modeling vote choice
dat_vot = pd.read_csv("data/votechoice2000-04-08.dat", sep="\t")
dat_vot['weight'] = dat_vot['weight'].fillna(1)
cols_to_check_vot = dat_vot.columns[:8]
ok_vot = (dat_vot[cols_to_check_vot].isna().sum(axis=1) == 0) & \
         ((dat_vot['cit'] == 1) | dat_vot['cit'].isna()) & \
         ((dat_vot['regist'] == "registered") | dat_vot['regist'].isna())
dat_vot = dat_vot[ok_vot].copy()
dat_vot['reg'] = dat_vot['stt'].apply(lambda x: reg_lkup[x-1])
year_map_vot = {'2000-ann': 2000, '2004-ann': 2004, '2008-pew': 2008}
dat_vot['year'] = dat_vot['file'].map(year_map_vot)
dat_vot = dat_vot[dat_vot['year'] != 2000].copy()

### census data from PUMS for population cell sizes
dat_pop = pd.read_csv("data/census-pums-pop-2000-04-08.dat", sep="\t")

### prepare data for looping through years
years = [2004, 2008]
L_z_incstt = [dat_stt['z.inc2004'], dat_stt['z.inc2007']]
L_z_repprv = [dat_stt['z.rep2000'], dat_stt['z.rep2004']]
L_z_trnprv = [dat_stt['z.trn2000'], dat_stt['z.trn2004']]

print("Preprocessing complete.", flush=True)

########################################################################################
### multilevel models
########################################################################################

# Create grid D
stt_range = range(n_stt) # 0 to 50
eth_range = range(n_eth) # 0 to 3
inc_range = range(n_inc) # 0 to 4
age_range = range(n_age) # 0 to 3

D = pd.DataFrame(list(itertools.product(stt_range, eth_range, inc_range, age_range)), 
                 columns=["stt", "eth", "inc", "age"])

D['grp'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['inc']}_{row['age']}", axis=1)
D['ix'] = np.arange(len(D))

reg_lkup_0 = np.array(reg_lkup) - 1 
D['reg'] = D['stt'].apply(lambda x: reg_lkup_0[x])

def encode_interaction(df, cols):
    return df[cols].apply(lambda row: "_".join(row.values.astype(str)), axis=1).astype('category').cat.codes

interaction_cols = [
    ['reg', 'eth'], ['reg', 'inc'], ['reg', 'age'],
    ['stt', 'eth'], ['stt', 'inc'], ['stt', 'age'],
    ['eth', 'inc'], ['eth', 'age'], ['inc', 'age'],
    # 3-way interactions commented out due to OOM issues with Pathfinder
    # ['stt', 'eth', 'inc'], ['stt', 'eth', 'age'], ['stt', 'inc', 'age'], ['eth', 'inc', 'age']
]

for cols in interaction_cols:
    name = "_".join(cols)
    D[name] = encode_interaction(D, cols)

D['z_inc'] = rescale(D['inc'])

def get_model_data(year, data_source, D_grid, target_col):
    # Filter by year
    tmp = data_source[data_source['year'] == year].copy()
    
    # Align indices (assuming data is 1-based, grid is 0-based)
    tmp['stt_0'] = tmp['stt'] - 1
    tmp['eth_0'] = tmp['eth'] - 1
    tmp['inc_0'] = tmp['inc'] - 1
    tmp['age_0'] = tmp['age'] - 1
    
    tmp['grp'] = tmp.apply(lambda row: f"{int(row['stt_0'])}_{int(row['eth_0'])}_{int(row['inc_0'])}_{int(row['age_0'])}", axis=1)
    
    def weighted_mean(x):
        if len(x) == 0: return 0
        w = x['weight']
        v = x[target_col]
        return np.sum(v * w) / np.sum(w)

    def design_effect(x):
        if len(x) <= 1: return 1
        w = x['weight']
        norm_w = w / w.mean()
        return 1 + np.var(norm_w, ddof=1)

    grouped = tmp.groupby('grp')
    mr = pd.DataFrame({
        'n': grouped.size(),
        'ybar_wt': grouped.apply(weighted_mean, include_groups=False),
        'des_eff_cell': grouped.apply(design_effect, include_groups=False)
    })
    
    D_tmp = D_grid.merge(mr, on='grp', how='left')
    D_tmp['n'] = D_tmp['n'].fillna(0)
    D_tmp['des_eff_cell'] = D_tmp['des_eff_cell'].fillna(1)
    
    mask = D_tmp['n'] > 1
    if mask.sum() > 0:
        des_eff = np.average(D_tmp.loc[mask, 'des_eff_cell'], weights=D_tmp.loc[mask, 'n'])
    else:
        des_eff = 1
        
    D_tmp['n_eff'] = D_tmp['n'] / des_eff
    D_tmp.loc[D_tmp['n_eff'] == 0, 'ybar_wt'] = 0.5
    
    D_tmp['trials'] = D_tmp['n_eff']
    D_tmp['successes'] = D_tmp['ybar_wt'] * D_tmp['n_eff']
    
    D_tmp['trials_int'] = D_tmp['trials'].round().astype(int)
    D_tmp['successes_int'] = D_tmp['successes'].round().astype(int)
    
    return D_tmp

L_z_incstt_map = {2004: dat_stt['z.inc2004'].values, 2008: dat_stt['z.inc2007'].values}
L_z_repprv_map = {2004: dat_stt['z.rep2000'].values, 2008: dat_stt['z.rep2004'].values}
L_z_trnprv_map = {2004: dat_stt['z.trn2000'].values, 2008: dat_stt['z.trn2004'].values}

for year in years:
    print(f"***** Multilevel Models for {year}", flush=True)
    
    # Update D with year-specific state predictors
    z_incstt_vals = L_z_incstt_map[year]
    z_repprv_vals = L_z_repprv_map[year]
    z_trnprv_vals = L_z_trnprv_map[year]
    
    D['z_incstt'] = D['stt'].apply(lambda x: z_incstt_vals[x])
    D['z_repprv'] = D['stt'].apply(lambda x: z_repprv_vals[x])
    D['z_trnprv'] = D['stt'].apply(lambda x: z_trnprv_vals[x])
    
    # --- Turnout Model ---
    print("*****   CPS Turnout Model", flush=True)
    data_cps = get_model_data(year, dat_cps, D, 'vote')
    
    # Filter observed
    observed_mask = data_cps['trials_int'] > 0
    df_obs = data_cps[observed_mask].copy()
    
    # Add interaction columns to df_obs (they are in D, merged in get_model_data)
    # get_model_data merges D_grid (which has interactions) with stats.
    # So df_obs has them.
    
    # Config
    if run_full_models:
        turnout_config = {
            'fixed_effects': ['z_inc', 'z_incstt', 'z_trnprv'],
            'fixed_interactions': [('z_inc', 'z_incstt'), ('z_inc', 'z_trnprv')],
            'random_effects': {
                'reg': {'idx_col': 'reg', 'n_cats': n_reg, 'slope_col': 'z_inc'},
                'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': 'z_inc'},
                'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': 'z_inc'},
                'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
                'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': 'z_inc'},
            }
        }
        
        # Add interactions to random effects
        for cols in interaction_cols:
            name = "_".join(cols)
            n_cats = int(D[name].max() + 1)
            turnout_config['random_effects'][name] = {'idx_col': name, 'n_cats': n_cats, 'slope_col': None}
    else:
        turnout_config = {
            'fixed_effects': [],
            'random_effects': {
                'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': None},
                'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': None},
                'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
                'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': None},
            }
        }
        
    coords = {"obs_id": df_obs.index}
    
    builder = MRPModelBuilder(df_obs, coords, turnout_config)
    print("Building Turnout Model...", flush=True)
    builder.build_model()
    print("Fitting Turnout Model...", flush=True)
    builder.fit(method="pathfinder", num_paths=10, num_draws=200, random_seed=42)
    
    print("Predicting Turnout...", flush=True)
    D[f'turn{year}_M'] = builder.predict(D)
    print("Turnout Prediction complete.", flush=True)
    
    # --- Vote Choice Model ---
    print("*****   Annenberg/Pew Vote Choice Model", flush=True)
    data_vot = get_model_data(year, dat_vot, D, 'rvote')
    observed_mask_vot = data_vot['trials_int'] > 0
    df_obs_vot = data_vot[observed_mask_vot].copy()
    
    if run_full_models:
        vote_config = {
            'fixed_effects': ['z_inc', 'z_incstt', 'z_repprv'],
            'fixed_interactions': [('z_inc', 'z_incstt'), ('z_inc', 'z_repprv')],
            'random_effects': {
                'reg': {'idx_col': 'reg', 'n_cats': n_reg, 'slope_col': 'z_inc'},
                'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': 'z_inc'},
                'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': 'z_inc'},
                'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
                'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': 'z_inc'},
            }
        }
        
        for cols in interaction_cols:
            name = "_".join(cols)
            n_cats = int(D[name].max() + 1)
            vote_config['random_effects'][name] = {'idx_col': name, 'n_cats': n_cats, 'slope_col': None}
    else:
        vote_config = {
            'fixed_effects': [],
            'random_effects': {
                'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': None},
                'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': None},
                'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
                'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': None},
            }
        }
        
    coords_vot = {"obs_id": df_obs_vot.index}
    
    builder_vot = MRPModelBuilder(df_obs_vot, coords_vot, vote_config)
    print("Building Vote Choice Model...", flush=True)
    builder_vot.build_model()
    print("Fitting Vote Choice Model...", flush=True)
    builder_vot.fit(method="pathfinder", num_paths=10, num_draws=200, random_seed=42)
    
    print("Predicting Vote Choice...")
    D[f'vote{year}_M'] = builder_vot.predict(D)
    print("Vote Choice Prediction complete.")
    
    del builder, builder_vot, data_cps, data_vot, df_obs, df_obs_vot
    gc.collect()

########################################################################################
### poststratification
########################################################################################

print("Starting Poststratification...", flush=True)

# Align indices for dat_pop
dat_pop['stt_0'] = dat_pop['stt'] - 1
dat_pop['eth_0'] = dat_pop['eth'] - 1
dat_pop['inc_0'] = dat_pop['inc'] - 1
dat_pop['age_0'] = dat_pop['age'] - 1

dat_pop['grp'] = dat_pop.apply(lambda row: f"{int(row['stt_0'])}_{int(row['eth_0'])}_{int(row['inc_0'])}_{int(row['age_0'])}", axis=1)

pop_agg = dat_pop.groupby('grp')[['wtd2004', 'wtd2008']].sum().reset_index()
pop_agg.rename(columns={'wtd2004': 'pop2004', 'wtd2008': 'pop2008'}, inplace=True)

D = D.merge(pop_agg, on='grp', how='left')
D['pop2004'] = D['pop2004'].fillna(0)
D['pop2008'] = D['pop2008'].fillna(0)

D = D.sort_values('ix')

### add up and alter based on actual turnout and vote estimates

def correct_weighted(a, w, target_val):
    logit_a = logit(a)
    def func(delta):
        pred = invlogit(logit_a + delta)
        return np.sum(pred * w) - target_val
            
    try:
        delta_opt = brentq(func, -10, 10)
    except ValueError:
        if np.sum(w) == 0: return a
        try:
            delta_opt = brentq(func, -100, 100)
        except:
            return a
    return invlogit(logit_a + delta_opt)

D['turn2004_adj'] = np.nan
D['turn2008_adj'] = np.nan
D['vote2004_adj'] = np.nan
D['vote2008_adj'] = np.nan

print("Starting Calibration...", flush=True)

for i in range(n_stt):
    stt_row = dat_stt.iloc[i]
    mask = D['stt'] == i
    
    if mask.sum() == 0: continue
    
    # Turnout 2004
    if 'turn2004_M' in D.columns:
        D.loc[mask, 'turn2004_adj'] = correct_weighted(
            D.loc[mask, 'turn2004_M'].values,
            D.loc[mask, 'pop2004'].values,
            stt_row['vote2004']
        )
    
    # Turnout 2008
    if 'turn2008_M' in D.columns:
        D.loc[mask, 'turn2008_adj'] = correct_weighted(
            D.loc[mask, 'turn2008_M'].values,
            D.loc[mask, 'pop2008'].values,
            stt_row['vote2008']
        )
    
    # Vote 2004
    if 'vote2004_M' in D.columns:
        w_vot_04 = D.loc[mask, 'pop2004'].values
        if w_vot_04.sum() > 0:
            w_vot_04 = w_vot_04 / w_vot_04.sum()
            
        D.loc[mask, 'vote2004_adj'] = correct_weighted(
            D.loc[mask, 'vote2004_M'].values,
            w_vot_04,
            stt_row['rep2004']
        )
    
    # Vote 2008
    if 'vote2008_M' in D.columns:
        w_vot_08 = D.loc[mask, 'pop2008'].values
        if w_vot_08.sum() > 0:
            w_vot_08 = w_vot_08 / w_vot_08.sum()
            
        D.loc[mask, 'vote2008_adj'] = correct_weighted(
            D.loc[mask, 'vote2008_M'].values,
            w_vot_08,
            stt_row['rep2008']
        )

print("MRP Replication complete.")

Preprocessing complete.
***** Multilevel Models for 2004
*****   CPS Turnout Model
Building Turnout Model...
Fitting Turnout Model...

Output()

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

Pathfinder Results                            
                                              
  No. model parameters     941                
                                              
  Configuration:                              
  num_draws_per_path       1000               
  history size (maxcor)    21                 
  max iterations           1000               
  ftol                     1.00e-05           
  gtol                     1.00e-08           
  max line search          1000               
  jitter                   2.0                
  epsilon                  1.00e-08           
  ELBO draws               10                 
                                              
  LBFGS Status:                               
  CONVERGED                7                  
  INIT_FAILED              3                  
  L-BFGS iterations        mean 312 ± std 48  
                                              
  Path Status:                                
  SUCCESS                  7                  
  LBFGS_FAILED             3                  
  ELBO argmax              mean 100 ± std 37  
                                              
  Importance Sampling:                        
  Method                   psis               
  Pareto k                 10.50              
                                              
  Timing (seconds):                           
  Compile                  5.95               
  Compute                  98.46              
  Total                    104.41             

Warnings:
  - INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this        
  occurence is high relative to the number of paths.

Output()

Predicting Turnout...
Turnout Prediction complete.
*****   Annenberg/Pew Vote Choice Model
Building Vote Choice Model...
Fitting Vote Choice Model...

Output()

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

Pathfinder Results                            
                                              
  No. model parameters     941                
                                              
  Configuration:                              
  num_draws_per_path       1000               
  history size (maxcor)    21                 
  max iterations           1000               
  ftol                     1.00e-05           
  gtol                     1.00e-08           
  max line search          1000               
  jitter                   2.0                
  epsilon                  1.00e-08           
  ELBO draws               10                 
                                              
  LBFGS Status:                               
  CONVERGED                7                  
  INIT_FAILED              3                  
  L-BFGS iterations        mean 329 ± std 51  
                                              
  Path Status:                                
  SUCCESS                  7                  
  LBFGS_FAILED             3                  
  ELBO argmax              mean 53 ± std 15   
                                              
  Importance Sampling:                        
  Method                   psis               
  Pareto k                 12.01              
                                              
  Timing (seconds):                           
  Compile                  4.20               
  Compute                  108.80             
  Total                    113.00             

Warnings:
  - INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this        
  occurence is high relative to the number of paths.

Output()

Predicting Vote Choice...
Vote Choice Prediction complete.
***** Multilevel Models for 2008
*****   CPS Turnout Model
Building Turnout Model...
Fitting Turnout Model...

Output()

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

Pathfinder Results                            
                                              
  No. model parameters     941                
                                              
  Configuration:                              
  num_draws_per_path       1000               
  history size (maxcor)    21                 
  max iterations           1000               
  ftol                     1.00e-05           
  gtol                     1.00e-08           
  max line search          1000               
  jitter                   2.0                
  epsilon                  1.00e-08           
  ELBO draws               10                 
                                              
  LBFGS Status:                               
  CONVERGED                7                  
  INIT_FAILED              3                  
  L-BFGS iterations        mean 284 ± std 43  
                                              
  Path Status:                                
  SUCCESS                  7                  
  LBFGS_FAILED             3                  
  ELBO argmax              mean 79 ± std 22   
                                              
  Importance Sampling:                        
  Method                   psis               
  Pareto k                 9.31               
                                              
  Timing (seconds):                           
  Compile                  2.81               
  Compute                  81.21              
  Total                    84.02              

Warnings:
  - INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this        
  occurence is high relative to the number of paths.

Output()

Predicting Turnout...
Turnout Prediction complete.
*****   Annenberg/Pew Vote Choice Model
Building Vote Choice Model...
Fitting Vote Choice Model...

Output()

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

Pathfinder Results                            
                                              
  No. model parameters     941                
                                              
  Configuration:                              
  num_draws_per_path       1000               
  history size (maxcor)    21                 
  max iterations           1000               
  ftol                     1.00e-05           
  gtol                     1.00e-08           
  max line search          1000               
  jitter                   2.0                
  epsilon                  1.00e-08           
  ELBO draws               10                 
                                              
  LBFGS Status:                               
  CONVERGED                8                  
  INIT_FAILED              2                  
  L-BFGS iterations        mean 294 ± std 46  
                                              
  Path Status:                                
  SUCCESS                  8                  
  LBFGS_FAILED             2                  
  ELBO argmax              mean 39 ± std 19   
                                              
  Importance Sampling:                        
  Method                   psis               
  Pareto k                 11.80              
                                              
  Timing (seconds):                           
  Compile                  2.94               
  Compute                  94.99              
  Total                    97.94              

Warnings:
  - INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this        
  occurence is high relative to the number of paths.

Output()

Predicting Vote Choice...
Vote Choice Prediction complete.
Starting Poststratification...
Starting Poststratification...
Starting Calibration...
MRP Replication complete.

In [4]:

def color_alpha(colors, alpha=1.0):
    # colors can be a list of hex or names
    rgba = mcolors.to_rgba_array(colors)
    rgba[:, 3] = alpha
    return rgba

def color_groups(x, x_seq, color1, color2, color3, alpha=1.0):
    # x is a series or array
    x = np.array(x)
    x = np.round(x)
    x = np.clip(x, x_seq[0], x_seq[1])
    # Normalize to 0-1 for colormap
    norm = mcolors.Normalize(vmin=x_seq[0], vmax=x_seq[1])
    cmap = mcolors.LinearSegmentedColormap.from_list("custom", [color1, color2, color3])
    return cmap(norm(x), alpha=alpha)

def plot_bubble(x, y, sz, bg="white", fg="black", title="", xlab="", ylab="", xlim=(0,1), ylim=(0,1), ax=None):
    if ax is None:
        fig, ax = plt.subplots()
    
    # Sort by size descending
    idx = np.argsort(sz)[::-1]
    x = np.array(x)[idx]
    y = np.array(y)[idx]
    sz = np.array(sz)[idx]
    
    if isinstance(bg, (list, np.ndarray)):
        if len(bg) == len(x): bg = np.array(bg)[idx]
    if isinstance(fg, (list, np.ndarray)):
        if len(fg) == len(x): fg = np.array(fg)[idx]
        
    ax.scatter(x, y, s=sz*15000, c=bg, edgecolors=fg, linewidth=0.5)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_title(title)
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    # Add grid lines
    ax.axvline(np.mean(xlim), color='grey', linestyle=':', linewidth=0.5)
    ax.axhline(np.mean(ylim), color='grey', linestyle=':', linewidth=0.5)
    return ax

_us_states_gdf = None

def get_us_map():
    global _us_states_gdf
    if _us_states_gdf is None:
        # Use a reliable GeoJSON source for US states
        url = "https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_500k.json"
        try:
            gdf = gpd.read_file(url)
            _us_states_gdf = gdf
        except Exception as e:
            print(f"Error loading map: {e}")
            return None
    return _us_states_gdf.copy()

def plot_map(ax, colors, title=""):
    # colors: array-like of colors (strings or RGBA), aligned with state_name (0-50)
    
    gdf = get_us_map()
    if gdf is None:
        ax.text(0.5, 0.5, "Map Error", ha='center')
        return

    # Map colors to gdf
    # Create a dict mapping NAME to color
    name_to_color = {}
    for idx, name in enumerate(state_name):
        if idx < len(colors):
            name_to_color[name] = colors[idx]
            
    gdf['color'] = gdf['NAME'].map(name_to_color).fillna('lightgrey')
    
    # Reproject to Albers Equal Area for better shape
    try:
        gdf = gdf.to_crs("EPSG:5070")
    except:
        pass

    # Separate AK, HI, and Lower 48
    ak = gdf[gdf['NAME'] == 'Alaska']
    hi = gdf[gdf['NAME'] == 'Hawaii']
    lower48 = gdf[~gdf['NAME'].isin(['Alaska', 'Hawaii', 'Puerto Rico'])]
    
    # Plot Lower 48
    lower48.plot(ax=ax, color=lower48['color'], edgecolor='white', linewidth=0.5)
    
    # Inset for Alaska
    if not ak.empty:
        ax_ak = ax.inset_axes([0.0, 0.0, 0.25, 0.25]) # Bottom left
        ak.plot(ax=ax_ak, color=ak['color'], edgecolor='white', linewidth=0.5)
        ax_ak.axis('off')
        # Zoom to AK
        minx, miny, maxx, maxy = ak.total_bounds
        ax_ak.set_xlim(minx, maxx)
        ax_ak.set_ylim(miny, maxy)

    # Inset for Hawaii
    if not hi.empty:
        ax_hi = ax.inset_axes([0.25, 0.0, 0.15, 0.15]) # Next to AK
        hi.plot(ax=ax_hi, color=hi['color'], edgecolor='white', linewidth=0.5)
        ax_hi.axis('off')
        # Zoom to HI
        minx, miny, maxx, maxy = hi.total_bounds
        ax_hi.set_xlim(minx, maxx)
        ax_hi.set_ylim(miny, maxy)

    ax.set_title(title, fontsize=10)
    ax.axis('off')

In [5]:

print("Generating Figure 1...")

# Subset data
mask_fig1 = (dat_vot['year'] == 2008) & (dat_vot['eth'] == 1)
df_fig1 = dat_vot[mask_fig1].copy()
df_fig1['stt_idx'] = df_fig1['stt'] - 1
df_fig1['trials_int'] = 1
df_fig1['successes_int'] = df_fig1['rvote'].astype(int)


# M1: rvote ~ inc + (1 | stt)
config_m1 = {
    'fixed_effects': ['inc'],
    'random_effects': {
        'stt': {'idx_col': 'stt_idx', 'n_cats': n_stt, 'slope_col': None}
    }
}
coords_fig1 = {"obs_id": df_fig1.index}
builder_m1 = MRPModelBuilder(df_fig1, coords_fig1, config_m1)
builder_m1.build_model()
builder_m1.fit(method="mcmc", draws=50, tune=50, chains=1, progressbar=False)

# M2: rvote ~ inc + (1 + inc | stt)
config_m2 = {
    'fixed_effects': ['inc'],
    'random_effects': {
        'stt': {'idx_col': 'stt_idx', 'n_cats': n_stt, 'slope_col': 'inc'}
    }
}
builder_m2 = MRPModelBuilder(df_fig1, coords_fig1, config_m2)
builder_m2.build_model()
builder_m2.fit(method="mcmc", draws=50, tune=50, chains=1, progressbar=False)

# Predictions
dat_pc = pd.DataFrame(list(itertools.product(range(n_stt), range(1, 6))), columns=['stt_idx', 'inc'])
dat_pc['stt'] = dat_pc['stt_idx'] + 1
dat_pc['M1'] = builder_m1.predict(dat_pc)
dat_pc['M2'] = builder_m2.predict(dat_pc)

def get_raw_mean(row):
    mask = (df_fig1['stt_idx'] == row['stt_idx']) & (df_fig1['inc'] == row['inc'])
    if mask.sum() == 0: return np.nan
    return np.average(df_fig1.loc[mask, 'rvote'], weights=df_fig1.loc[mask, 'weight'])

dat_pc['raw'] = dat_pc.apply(get_raw_mean, axis=1)

# Plotting
fig1, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
titles = ["Raw Values", "Income coefficient\nconsistent across states", "Income coefficient\nvarying by state"]
data_cols = ['raw', 'M1', 'M2']

rep2008 = dat_stt['rep2008'].values
stt_colors = color_groups(rep2008 * 100, [25, 75], "darkblue", "grey", "darkred")

bold_states = ["MS", "OH", "CT"]
bold_indices = [state_abb.index(s) for s in bold_states]

for i, ax in enumerate(axes):
    ax.set_title(titles[i])
    ax.set_xticks(range(1, 6))
    ax.set_xticklabels(inc_label, rotation=90)
    ax.set_ylim(0, 1)
    
    for stt_idx in range(n_stt):
        if stt_idx in bold_indices: continue
        subset = dat_pc[dat_pc['stt_idx'] == stt_idx]
        ax.plot(subset['inc'], subset[data_cols[i]], color=stt_colors[stt_idx], linewidth=0.5, alpha=0.7)
        
    for stt_idx in range(n_stt):
        if stt_idx not in bold_indices: continue
        subset = dat_pc[dat_pc['stt_idx'] == stt_idx]
        ax.plot(subset['inc'], subset[data_cols[i]], color=stt_colors[stt_idx], linewidth=2)
        ax.text(0.9, subset[subset['inc']==1][data_cols[i]].values[0], state_abb[stt_idx], ha='right', fontsize=8)
        ax.text(5.1, subset[subset['inc']==5][data_cols[i]].values[0], state_abb[stt_idx], ha='left', fontsize=8)

plt.tight_layout()
plt.savefig("figs/1.png")
print("Figure 1 saved.")

Only 50 samples per chain. Reliable r-hat and ESS diagnostics require longer chains for accurate estimate.

Generating Figure 1...

Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [b_0, b_inc, stt::sigma, stt::raw]
Sampling 1 chain for 50 tune and 50 draw iterations (50 + 50 draws total) took 13 seconds.
The number of samples is too small to check convergence reliably.
Only 50 samples per chain. Reliable r-hat and ESS diagnostics require longer chains for accurate estimate.
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [b_0, b_inc, stt::sigma, stt::raw, stt::sigma_slope, stt::raw_slope]
Sampling 1 chain for 50 tune and 50 draw iterations (50 + 50 draws total) took 27 seconds.
The number of samples is too small to check convergence reliably.

Figure 1 saved.

$No description has been provided for this image$

In [6]:

print("Generating Figure 2...")

# Aggregations
D['plot_grp'] = D.apply(lambda row: f"{row['stt']}_{row['inc']}", axis=1)
D2 = D.groupby('plot_grp').apply(
    lambda x: pd.Series({
        'stt': x['stt'].iloc[0],
        'inc': x['inc'].iloc[0],
        'vote2008': np.sum(x['vote2008_adj'] * x['pop2008']) / np.sum(x['pop2008'])
    }), include_groups=False
).reset_index()

D3 = D[D['eth'] == 0].groupby('plot_grp').apply( # eth 0 is White
    lambda x: pd.Series({
        'stt': x['stt'].iloc[0],
        'inc': x['inc'].iloc[0],
        'vote2008': np.sum(x['vote2008_adj'] * x['pop2008']) / np.sum(x['pop2008'])
    }), include_groups=False
).reset_index()

stt_inc_grid = pd.DataFrame(list(itertools.product(range(n_stt), range(n_inc))), columns=['stt', 'inc'])
stt_inc_grid['fit1'] = np.nan
stt_inc_grid['se1'] = np.nan
stt_inc_grid['mu1'] = np.nan
stt_inc_grid['fit2'] = np.nan
stt_inc_grid['se2'] = np.nan
stt_inc_grid['mu2'] = np.nan

# We need to compute weighted means and SEs from dat_vot
# dat_vot uses 1-based stt and inc.
# stt_inc_grid uses 0-based.

for i, row in stt_inc_grid.iterrows():
    s, inc = int(row['stt']), int(row['inc'])
    
    # All voters
    mask = (dat_vot['stt'] == s + 1) & (dat_vot['inc'] == inc + 1) & (dat_vot['year'] == 2008)
    if mask.sum() > 1:
        # Model fit
        fit_val = D2.loc[(D2['stt'] == s) & (D2['inc'] == inc), 'vote2008']
        if not fit_val.empty:
            stt_inc_grid.at[i, 'fit1'] = fit_val.values[0]
        
        # Raw data
        w = dat_vot.loc[mask, 'weight']
        v = dat_vot.loc[mask, 'rvote']
        mu = np.average(v, weights=w)
        mu = min(0.98, max(0.02, mu))
        se = np.sqrt((mu * (1 - mu)) / mask.sum())
        stt_inc_grid.at[i, 'mu1'] = mu
        stt_inc_grid.at[i, 'se1'] = se
        
    # White voters
    mask_w = mask & (dat_vot['eth'] == 1)
    if mask_w.sum() > 1:
        fit_val = D3.loc[(D3['stt'] == s) & (D3['inc'] == inc), 'vote2008']
        if not fit_val.empty:
            stt_inc_grid.at[i, 'fit2'] = fit_val.values[0]
            
        w = dat_vot.loc[mask_w, 'weight']
        v = dat_vot.loc[mask_w, 'rvote']
        mu = np.average(v, weights=w)
        mu = min(0.98, max(0.02, mu))
        se = np.sqrt((mu * (1 - mu)) / mask_w.sum())
        stt_inc_grid.at[i, 'mu2'] = mu
        stt_inc_grid.at[i, 'se2'] = se

# Plotting Fig 2
fig2 = plt.figure(figsize=(10, 10))
sort_state = np.argsort(dat_stt['rep2008'].values[:n_stt])[::-1] # Descending
count = 0
rows, cols = 7, 7

for idx in sort_state:
    if state_abb[idx] in ["AK", "HI", "DC"]: continue
    
    count += 1
    ax = fig2.add_subplot(rows, cols, count)
    
    # Axis labels only on edges
    if count % cols == 1:
        ax.set_yticks([0.02, 0.5, 0.95])
        ax.set_yticklabels(["0%", "50%", "100%"], fontsize=8)
    else:
        ax.set_yticks([])
        
    if count > (48 - cols):
        ax.set_xticks([0, 2, 4])
        ax.set_xticklabels(["poor", "mid", "rich"], fontsize=8)
    else:
        ax.set_xticks([])
        
    ax.axhline(0.5, color='gray', linewidth=0.5)
    ax.set_ylim(0, 1)
    ax.set_xlim(-0.5, 4.5)
    
    tmp = stt_inc_grid[stt_inc_grid['stt'] == idx]
    
    # All voters (gray)
    ax.errorbar(tmp['inc'], tmp['mu1'], yerr=tmp['se1'], fmt='o', color='gray', markersize=3, elinewidth=1)
    ax.plot(tmp['inc'], tmp['fit1'], color='gray', linewidth=1)
    
    # White voters (orange)
    ax.errorbar(tmp['inc'] + 0.1, tmp['mu2'], yerr=tmp['se2'], fmt='o', color='darkorange', markersize=3, elinewidth=1)
    ax.plot(tmp['inc'], tmp['fit2'], color='darkorange', linewidth=1)
    
    ax.text(2, 0.9, state_name[idx], ha='center', fontsize=9)

plt.tight_layout()
plt.savefig("figs/2.png")
print("Figure 2 saved.")

Generating Figure 2...
Figure 2 saved.

No description has been provided for this image

In [7]:

print("Generating Figure 3...")

fig3, axes = plt.subplots(1, 3, figsize=(18, 6))

# 1. State x Eth
D['grp_plot'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}", axis=1)
D_plot = D.groupby('grp_plot').apply(
    lambda x: pd.Series({
        'eth': x['eth'].iloc[0],
        'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
        'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
        'pop2008': np.sum(x['pop2008'])
    }), include_groups=False
).reset_index()

eth_colors = ['white', 'black', 'red', 'green']
bg_colors = [eth_colors[int(e)] for e in D_plot['eth']]
fg_colors = ['grey'] * len(D_plot)

plot_bubble(D_plot['vote2008'], D_plot['turn2008'], np.sqrt(D_plot['pop2008'])/15000, 
            bg=bg_colors, fg=fg_colors, title="State x Ethnicity", 
            xlab="McCain Vote 2008", ylab="Turnout 2008", ax=axes[0])

# 2. State x Eth x Inc
D['grp_plot'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['inc']}", axis=1)
D_plot = D.groupby('grp_plot').apply(
    lambda x: pd.Series({
        'eth': x['eth'].iloc[0],
        'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
        'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
        'pop2008': np.sum(x['pop2008'])
    }), include_groups=False
).reset_index()

bg_colors = [eth_colors[int(e)] for e in D_plot['eth']]
fg_colors = ['grey'] * len(D_plot)
plot_bubble(D_plot['vote2008'], D_plot['turn2008'], np.sqrt(D_plot['pop2008'])/15000, 
            bg=bg_colors, fg=fg_colors, title="State x Ethnicity x Income", 
            xlab="McCain Vote 2008", ylab="", ax=axes[1])

# 3. State x Eth x Inc x Age
D['grp_plot'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['inc']}_{row['age']}", axis=1)
D_plot = D.groupby('grp_plot').apply(
    lambda x: pd.Series({
        'eth': x['eth'].iloc[0],
        'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
        'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
        'pop2008': np.sum(x['pop2008'])
    }), include_groups=False
).reset_index()

bg_colors = [eth_colors[int(e)] for e in D_plot['eth']]
fg_colors = ['grey'] * len(D_plot)
plot_bubble(D_plot['vote2008'], D_plot['turn2008'], np.sqrt(D_plot['pop2008'])/15000, 
            bg=bg_colors, fg=fg_colors, title="State x Ethnicity x Income x Age", 
            xlab="McCain Vote 2008", ylab="", ax=axes[2])

plt.tight_layout()
plt.savefig("figs/3.png")
print("Figure 3 saved.")

Generating Figure 3...

/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:29: RuntimeWarning: invalid value encountered in scalar divide
  'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:30: RuntimeWarning: invalid value encountered in scalar divide
  'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),
/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:46: RuntimeWarning: invalid value encountered in scalar divide
  'vote2008': np.sum(x['vote2008_adj'] * x['turn2008_adj'] * x['pop2008']) / np.sum(x['turn2008_adj'] * x['pop2008']),
/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/427786064.py:47: RuntimeWarning: invalid value encountered in scalar divide
  'turn2008': np.sum(x['turn2008_adj'] * x['pop2008']) / np.sum(x['pop2008']),

Figure 3 saved.

In [8]:

print("Generating Figure 4...")

# Prepare data
ok_grp = np.zeros(len(D), dtype=bool)
for i in range(n_stt):
    mask = D['stt'] == i
    tmp = D.loc[mask, 'pop2008'] * D.loc[mask, 'turn2008_adj']
    threshold = tmp.sum() / 100
    ok_grp[mask] = (tmp >= threshold)

fig4, axes = plt.subplots(n_inc, n_age, figsize=(15, 10))
fig4.suptitle("McCain 2008 minus Bush 2004 among whites", fontsize=16)

for i_age in range(n_age):
    for i_inc in range(n_inc):
        ax = axes[i_inc, i_age]
        
        # Filter
        mask = (D['age'] == i_age) & (D['inc'] == i_inc) & (D['eth'] == 0) # eth 0 is White
        subset = D[mask].copy()
        
        # Ensure we have data for all states in order
        # subset might be missing states if they were filtered out earlier, but D is a grid.
        # However, D was created with itertools, so it should be complete.
        # We sort by stt to be sure.
        subset = subset.sort_values('stt')
        
        if len(subset) != n_stt:
            # Should not happen given D construction, but safety check
            subset = subset.set_index('stt').reindex(range(n_stt))
        
        # Calculate values
        vals = (subset['vote2008_adj'] - subset['vote2004_adj']) * 100
        
        # Get colors
        colors = color_groups(vals, [-25, 25], "darkblue", "white", "darkred", alpha=0.75)
        
        # Apply mask for small groups
        # ok_grp is aligned with D. We need it for this subset.
        # Since subset is sorted by stt and filtered by age/inc/eth, 
        # we need to extract corresponding ok_grp values.
        # We can use the index of subset (which comes from D) to index ok_grp.
        subset_indices = subset.index
        grp_mask = ok_grp[subset_indices]
        
        # Set alpha to 0 for masked
        colors[~grp_mask] = [0, 0, 0, 0]
        
        plot_map(ax, colors, title=f"Age {age_label[i_age]}\nInc {inc_label[i_inc]}")

plt.tight_layout()
plt.savefig("figs/4.png")
print("Figure 4 saved.")

Generating Figure 4...
Figure 4 saved.

In [9]:

print("Generating Figure 5...")

agg_mean = np.sum(D['pop2008'] * D['turn2008_adj']) / np.sum(D['pop2008']) - \
           np.sum(D['pop2004'] * D['turn2004_adj']) / np.sum(D['pop2004'])

fig5, axes = plt.subplots(n_age, n_eth, figsize=(12, 12), sharex=True, sharey=True)
fig5.suptitle("Turnout Swing (08-04)", fontsize=16)

for i_eth in range(n_eth):
    for i_age in range(n_age):
        ax = axes[i_age, i_eth]
        
        mask = (D['eth'] == i_eth) & (D['age'] == i_age)
        subset = D[mask]
        
        # Aggregate by income
        res = []
        for i_inc in range(n_inc):
            mask2 = mask & (D['inc'] == i_inc)
            sub2 = D[mask2]
            p04 = sub2['pop2004'].sum()
            t04 = np.sum(sub2['pop2004'] * sub2['turn2004_adj']) / p04 if p04 > 0 else 0
            p08 = sub2['pop2008'].sum()
            t08 = np.sum(sub2['pop2008'] * sub2['turn2008_adj']) / p08 if p08 > 0 else 0
            res.append([p04, t04, p08, t08])
            
        res = np.array(res)
        res[:, 0] = res[:, 0] / res[:, 0].sum() / 2 # pop04
        res[:, 2] = res[:, 2] / res[:, 2].sum() / 2 # pop08
        
        grp_mean = np.sum(subset['pop2008'] * subset['turn2008_adj']) / np.sum(subset['pop2008']) - \
                   np.sum(subset['pop2004'] * subset['turn2004_adj']) / np.sum(subset['pop2004'])
                   
        col1 = "lightblue" if grp_mean > 0.05 else "lightgrey"
        col2 = "darkblue" if grp_mean > 0.05 else "black"
        
        x = np.arange(n_inc)
        ax.bar(x, res[:, 2], color=col1, width=0.8)
        
        swing = res[:, 3] - res[:, 1]
        ax.plot(x, swing, 'o-', color=col2, markersize=4)
        
        ax.axhline(0, linestyle=':', color='black')
        ax.axhline(agg_mean, linestyle=':', color='red')
        
        ax.set_ylim(-0.05, 0.2)
        
        if i_age == 0:
            ax.set_title(eth_label[i_eth])
        if i_eth == 0:
            ax.set_ylabel(age_label[i_age])
            
        if i_age == 3:
            ax.set_xticks(x)
            ax.set_xticklabels(inc_label, rotation=90)

plt.tight_layout()
plt.savefig("figs/5.png")
print("Figure 5 saved.")

Generating Figure 5...
Figure 5 saved.

In [10]:

print("Generating Figure 6...")

D['plot_grp'] = D.apply(lambda row: f"{row['stt']}_{row['eth']}_{row['age']}", axis=1)
D3 = D.groupby('plot_grp').apply(
    lambda x: pd.Series({
        'stt': x['stt'].iloc[0],
        'eth': x['eth'].iloc[0],
        'age': x['age'].iloc[0],
        'pop2008': np.sum(x['pop2008']),
        'turn2008': np.sum(x['pop2008'] * x['turn2008_adj']) / np.sum(x['pop2008'])
    }), include_groups=False
).reset_index()

ok_grp = np.zeros(len(D3), dtype=bool)
for i in range(n_stt):
    mask = D3['stt'] == i
    tmp = D3.loc[mask, 'pop2008']
    threshold = tmp.sum() / 100
    ok_grp[mask] = (tmp >= threshold)

from matplotlib.colors import LinearSegmentedColormap
fig6, axes = plt.subplots(n_age, n_eth, figsize=(15, 12))
fig6.suptitle("Turnout among Young Whites and Minorities Over-Emphasized", fontsize=16)

dark_red = [0.5, 0.0, 0.0]
white = [1.0, 1.0, 1.0]
dark_green = [0.0, 0.5, 0.0]
color_nodes = [
    (0.0, dark_red),    # Start at 0.0 (vmin) with dark_red
    (0.6, white),       # Pass through white at the 0.6 point
    (1.0, dark_green)   # End at 1.0 (vmax) with dark_green
]
custom_cmap = LinearSegmentedColormap.from_list("RedWhiteGreen", color_nodes)

for i_age in range(n_age):
    for i_eth in range(n_eth):
        ax = axes[i_age, i_eth]
        
        mask = (D3['age'] == i_age) & (D3['eth'] == i_eth)
        subset = D3[mask].copy()
        
        subset = subset.sort_values('stt')
        if len(subset) != n_stt:
            subset = subset.set_index('stt').reindex(range(n_stt))
            
        vals = subset['turn2008']
        colors = color_groups(vals, [0.4, 0.8], "darkred", "white", "green", alpha=0.75)
        
        # Apply mask
        subset_indices = subset.index
        grp_mask = ok_grp[subset_indices]
        colors[~grp_mask] = [0, 0, 0, 0]
       
        rplot = plot_map(ax, colors, title="")
        
        ax.axis("on")
        ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
        for spine in ax.spines.values():
            spine.set_visible(False)
        if i_eth == 0:
            ax.set_ylabel(age_label[i_age], loc="center", fontsize=12)
            
        # Apply X-label only to the last row (i_age == n_age - 1)
        if i_age == 0:
            ax.set_title(eth_label[i_eth], y=1.0, fontsize=12)
            # ax.set_xlabel(eth_label[i_eth], loc="center")
            # ax.xaxis.set_label_position('top')

norm = mpl.colors.Normalize(vmin=np.min(colors[colors != 0]), vmax=np.max(colors))
sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm)
sm.set_array([]) 
cbar = fig6.colorbar(sm, ax=ax)
cbar.set_label('Turnout', rotation=270, labelpad=15)
plt.tight_layout()
plt.savefig("figs/6.png")
print("Figure 6 saved.")

Generating Figure 6...

/var/folders/6h/4989_pyx21d34x6ywybxzydc0000gn/T/ipykernel_34372/1415949281.py:10: RuntimeWarning: invalid value encountered in scalar divide
  'turn2008': np.sum(x['pop2008'] * x['turn2008_adj']) / np.sum(x['pop2008'])

Figure 6 saved.

$No description has been provided for this image$

In [11]:

year = 2008
z_incstt_vals = L_z_incstt_map[year]
z_repprv_vals = L_z_repprv_map[year]
z_trnprv_vals = L_z_trnprv_map[year]

D['z_incstt'] = D['stt'].apply(lambda x: z_incstt_vals[x])
D['z_repprv'] = D['stt'].apply(lambda x: z_repprv_vals[x])
D['z_trnprv'] = D['stt'].apply(lambda x: z_trnprv_vals[x])

# --- Turnout Model ---
print("*****   CPS Turnout Model", flush=True)
data_cps = get_model_data(year, dat_cps, D, 'vote')

# Filter observed
observed_mask = data_cps['trials_int'] > 0
df_obs = data_cps[observed_mask].copy()

# Add interaction columns to df_obs (they are in D, merged in get_model_data)
# get_model_data merges D_grid (which has interactions) with stats.
# So df_obs has them.

# Config
if run_full_models:
    turnout_config = {
        'fixed_effects': ['z_inc', 'z_incstt', 'z_trnprv'],
        'fixed_interactions': [('z_inc', 'z_incstt'), ('z_inc', 'z_trnprv')],
        'random_effects': {
            'reg': {'idx_col': 'reg', 'n_cats': n_reg, 'slope_col': 'z_inc'},
            'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': 'z_inc'},
            'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': 'z_inc'},
            'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
            'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': 'z_inc'},
        }
    }
    
    # Add interactions to random effects
    for cols in interaction_cols:
        name = "_".join(cols)
        n_cats = int(D[name].max() + 1)
        turnout_config['random_effects'][name] = {'idx_col': name, 'n_cats': n_cats, 'slope_col': None}
else:
    turnout_config = {
        'fixed_effects': [],
        'random_effects': {
            'stt': {'idx_col': 'stt', 'n_cats': n_stt, 'slope_col': None},
            'eth': {'idx_col': 'eth', 'n_cats': n_eth, 'slope_col': None},
            'inc': {'idx_col': 'inc', 'n_cats': n_inc, 'slope_col': None},
            'age': {'idx_col': 'age', 'n_cats': n_age, 'slope_col': None},
        }
    }
    
coords = {"obs_id": df_obs.index}

builder = MRPModelBuilder(df_obs, coords, turnout_config)
print("Building Turnout Model...", flush=True)
builder.build_model()
print("Fitting Turnout Model...", flush=True)
builder.fit(method="pathfinder", num_paths=10, num_draws=200, random_seed=42)

*****   CPS Turnout Model
Building Turnout Model...
Fitting Turnout Model...

Output()

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: divide by zero encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: overflow encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

/Users/erik/Projects/pytensor/pytensor/tensor/math.py:3023: RuntimeWarning: invalid value encountered in matmul
  output_storage[0][0] = np.matmul(*inputs)

Pathfinder Results                            
                                              
  No. model parameters     941                
                                              
  Configuration:                              
  num_draws_per_path       1000               
  history size (maxcor)    21                 
  max iterations           1000               
  ftol                     1.00e-05           
  gtol                     1.00e-08           
  max line search          1000               
  jitter                   2.0                
  epsilon                  1.00e-08           
  ELBO draws               10                 
                                              
  LBFGS Status:                               
  CONVERGED                7                  
  INIT_FAILED              3                  
  L-BFGS iterations        mean 284 ± std 43  
                                              
  Path Status:                                
  SUCCESS                  7                  
  LBFGS_FAILED             3                  
  ELBO argmax              mean 79 ± std 22   
                                              
  Importance Sampling:                        
  Method                   psis               
  Pareto k                 9.31               
                                              
  Timing (seconds):                           
  Compile                  2.88               
  Compute                  83.67              
  Total                    86.55              

Warnings:
  - INIT_FAILED: LBFGS failed to initialize. Consider reparameterizing the model or reducing jitter if this        
  occurence is high relative to the number of paths.

Output()

Out[11]:

arviz.InferenceData

In [ ]: