In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# load data from a csv into a table
dat = pd.read_csv('ppv_data.csv') # set your own directory
dat.head()
Out[2]:
monk session piccat dv Nimg Ntot sessdate
0 1 1 1 -20 0 24 2008-03-15
1 1 1 2 -20 13 24 2008-03-15
2 1 1 3 -20 2 24 2008-03-15
3 1 1 4 -20 11 24 2008-03-15
4 1 1 1 -10 10 24 2008-03-15
In [3]:
# transfer columns into categorical
cat_vars = ['monk', 'session', 'piccat']
for col in cat_vars:
    dat[col] = dat[col].astype('category')
print(dat.dtypes)
monk        category
session     category
piccat      category
dv             int64
Nimg           int64
Ntot           int64
sessdate      object
dtype: object
In [4]:
# subsetting the data
# Extract a subset of the data to use in developing the analysis. 
# For instance, session 9, picture category 4.
subset = dat.loc[(dat['session'] == 9) & (dat['piccat'] == 4), :]
subset.head()
Out[4]:
monk session piccat dv Nimg Ntot sessdate
175 1 9 4 -120 2 21 2008-03-28
179 1 9 4 -100 0 0 2008-03-28
183 1 9 4 -40 7 30 2008-03-28
187 1 9 4 0 15 32 2008-03-28
191 1 9 4 40 25 30 2008-03-28
In [5]:
# Remove any rows in the data subset that correspond to no observations (Ntot = 0).
subset = subset.loc[subset['Ntot'] != 0, :]
subset.head()
Out[5]:
monk session piccat dv Nimg Ntot sessdate
175 1 9 4 -120 2 21 2008-03-28
183 1 9 4 -40 7 30 2008-03-28
187 1 9 4 0 15 32 2008-03-28
191 1 9 4 40 25 30 2008-03-28
199 1 9 4 120 18 20 2008-03-28
In [6]:
# Examining the data
# For each value difference in your data subset, 
# calculate the proportion of trials on which the animal chose the juice-plus-image option.
subset['proportion'] = subset['Nimg'] / subset['Ntot']
subset.head()
Out[6]:
monk session piccat dv Nimg Ntot sessdate proportion
175 1 9 4 -120 2 21 2008-03-28 0.095238
183 1 9 4 -40 7 30 2008-03-28 0.233333
187 1 9 4 0 15 32 2008-03-28 0.468750
191 1 9 4 40 25 30 2008-03-28 0.833333
199 1 9 4 120 18 20 2008-03-28 0.900000
In [7]:
# Plot this proportion as a set of points.
sns.scatterplot(subset, x = 'dv', y = 'proportion')
Out[7]:
<Axes: xlabel='dv', ylabel='proportion'>
No description has been provided for this image
In [8]:
# fit GLM
endog = [subset['Nimg'], subset['Ntot'] - subset['Nimg']]
endog = pd.concat(endog, axis=1)
endog.columns = ['Nimg', 'Nnot']
exog = sm.add_constant(subset['dv'], prepend = False)

fitc = sm.GLM(endog, exog, family = sm.families.Binomial()).fit()
fitc.summary()
Out[8]:
Generalized Linear Model Regression Results
Dep. Variable: ['Nimg', 'Nnot'] No. Observations: 5
Model: GLM Df Residuals: 3
Model Family: Binomial Df Model: 1
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -9.7379
Date: Sun, 18 Feb 2024 Deviance: 3.6799
Time: 09:09:39 Pearson chi2: 3.69
No. Iterations: 5 Pseudo R-squ. (CS): 1.000
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
dv 0.0233 0.004 5.288 0.000 0.015 0.032
const 0.0421 0.212 0.199 0.842 -0.372 0.457
In [9]:
# check the fit
# new dv values
fit_dv = np.linspace(-150, 150, num = 301)

# generate predictions
# get_prediction() returns an object contains many informations,
# and you need to obtain the predicted values with predicted_mean
# fit_p = fitc.get_prediction(sm.add_constant(fit_dv, prepend = False)).predicted_mean
# predict() only returns the predicted values
fit_p = fitc.predict(sm.add_constant(fit_dv, prepend = False))

# make plot
ax = sns.scatterplot(subset, x = 'dv', y = 'proportion')
ax.plot(fit_dv, fit_p, 'r')
ax.set(xlabel='dv', ylabel='proportion')

# calculate image values
img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
p_slope = fitc.pvalues.iloc[0]
No description has been provided for this image
In [10]:
# function fit_model
def fit_model(tbl):
    # exclude no oberservation
    tbl = tbl.loc[tbl['Ntot'] != 0, :]
    
    # prepare for regression
    endog = [tbl['Nimg'], tbl['Ntot'] - tbl['Nimg']]
    endog = pd.concat(endog, axis=1)
    endog.columns = ['Nimg', 'Nnot']
    exog = sm.add_constant(tbl['dv'], prepend = False)

    try:
        # sm.GLM can produce a lot of warning when this function is applied to every entry
        # The with ... block will catch and ignore these warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            # fit logistic model by GLM
            fitc = sm.GLM(endog, exog, family = sm.families.Binomial(), missing = 'drop').fit()
    
        # calculate image value and p value
        img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
        p_slope = fitc.pvalues.iloc[0]
    except:
        # assign value > 1000
        img_val = 2000
        p_slope = 2000
        
    tbl_out = pd.Series([img_val, p_slope], index = ['img_value', 'p_slope'])
    
    return tbl_out

# observed=True saves you memory when you have unobserved categorical values.
# For example, assume the piccat categories go from 0 to 6, but your dataset only has 1, 3, 5.
# In this case, observed=True will return a DataFrame with only 1, 3, 5,
# but observed=False will return a DF with 0 to 9, and under 0, 2, 4, 6, everything is zero or NaN.
results = dat.groupby(['session', 'piccat'], observed = True).apply(fit_model)
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
In [11]:
# clean results and produce boxplot
# clean results
results = results.loc[np.abs(results['img_value']) <= 1000]
results.reset_index(level = 1, inplace = True)

# generate box plot
sns.boxplot(x = 'piccat', y = 'img_value', data = results[['piccat', 'img_value']],
            flierprops = {"marker": 'x'})
Out[11]:
<Axes: xlabel='piccat', ylabel='img_value'>
No description has been provided for this image