In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# load data from a csv into a table
dat = pd.read_csv('ppv_data.csv') # set your own directory
dat.head()
Out[2]:
monk | session | piccat | dv | Nimg | Ntot | sessdate | |
---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | -20 | 0 | 24 | 2008-03-15 |
1 | 1 | 1 | 2 | -20 | 13 | 24 | 2008-03-15 |
2 | 1 | 1 | 3 | -20 | 2 | 24 | 2008-03-15 |
3 | 1 | 1 | 4 | -20 | 11 | 24 | 2008-03-15 |
4 | 1 | 1 | 1 | -10 | 10 | 24 | 2008-03-15 |
In [3]:
# transfer columns into categorical
cat_vars = ['monk', 'session', 'piccat']
for col in cat_vars:
dat[col] = dat[col].astype('category')
print(dat.dtypes)
monk category session category piccat category dv int64 Nimg int64 Ntot int64 sessdate object dtype: object
In [4]:
# subsetting the data
# Extract a subset of the data to use in developing the analysis.
# For instance, session 9, picture category 4.
subset = dat.loc[(dat['session'] == 9) & (dat['piccat'] == 4), :]
subset.head()
Out[4]:
monk | session | piccat | dv | Nimg | Ntot | sessdate | |
---|---|---|---|---|---|---|---|
175 | 1 | 9 | 4 | -120 | 2 | 21 | 2008-03-28 |
179 | 1 | 9 | 4 | -100 | 0 | 0 | 2008-03-28 |
183 | 1 | 9 | 4 | -40 | 7 | 30 | 2008-03-28 |
187 | 1 | 9 | 4 | 0 | 15 | 32 | 2008-03-28 |
191 | 1 | 9 | 4 | 40 | 25 | 30 | 2008-03-28 |
In [5]:
# Remove any rows in the data subset that correspond to no observations (Ntot = 0).
subset = subset.loc[subset['Ntot'] != 0, :]
subset.head()
Out[5]:
monk | session | piccat | dv | Nimg | Ntot | sessdate | |
---|---|---|---|---|---|---|---|
175 | 1 | 9 | 4 | -120 | 2 | 21 | 2008-03-28 |
183 | 1 | 9 | 4 | -40 | 7 | 30 | 2008-03-28 |
187 | 1 | 9 | 4 | 0 | 15 | 32 | 2008-03-28 |
191 | 1 | 9 | 4 | 40 | 25 | 30 | 2008-03-28 |
199 | 1 | 9 | 4 | 120 | 18 | 20 | 2008-03-28 |
In [6]:
# Examining the data
# For each value difference in your data subset,
# calculate the proportion of trials on which the animal chose the juice-plus-image option.
subset['proportion'] = subset['Nimg'] / subset['Ntot']
subset.head()
Out[6]:
monk | session | piccat | dv | Nimg | Ntot | sessdate | proportion | |
---|---|---|---|---|---|---|---|---|
175 | 1 | 9 | 4 | -120 | 2 | 21 | 2008-03-28 | 0.095238 |
183 | 1 | 9 | 4 | -40 | 7 | 30 | 2008-03-28 | 0.233333 |
187 | 1 | 9 | 4 | 0 | 15 | 32 | 2008-03-28 | 0.468750 |
191 | 1 | 9 | 4 | 40 | 25 | 30 | 2008-03-28 | 0.833333 |
199 | 1 | 9 | 4 | 120 | 18 | 20 | 2008-03-28 | 0.900000 |
In [7]:
# Plot this proportion as a set of points.
sns.scatterplot(subset, x = 'dv', y = 'proportion')
Out[7]:
<Axes: xlabel='dv', ylabel='proportion'>
In [8]:
# fit GLM
endog = [subset['Nimg'], subset['Ntot'] - subset['Nimg']]
endog = pd.concat(endog, axis=1)
endog.columns = ['Nimg', 'Nnot']
exog = sm.add_constant(subset['dv'], prepend = False)
fitc = sm.GLM(endog, exog, family = sm.families.Binomial()).fit()
fitc.summary()
Out[8]:
Dep. Variable: | ['Nimg', 'Nnot'] | No. Observations: | 5 |
---|---|---|---|
Model: | GLM | Df Residuals: | 3 |
Model Family: | Binomial | Df Model: | 1 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -9.7379 |
Date: | Sun, 18 Feb 2024 | Deviance: | 3.6799 |
Time: | 09:09:39 | Pearson chi2: | 3.69 |
No. Iterations: | 5 | Pseudo R-squ. (CS): | 1.000 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
dv | 0.0233 | 0.004 | 5.288 | 0.000 | 0.015 | 0.032 |
const | 0.0421 | 0.212 | 0.199 | 0.842 | -0.372 | 0.457 |
In [9]:
# check the fit
# new dv values
fit_dv = np.linspace(-150, 150, num = 301)
# generate predictions
# get_prediction() returns an object contains many informations,
# and you need to obtain the predicted values with predicted_mean
# fit_p = fitc.get_prediction(sm.add_constant(fit_dv, prepend = False)).predicted_mean
# predict() only returns the predicted values
fit_p = fitc.predict(sm.add_constant(fit_dv, prepend = False))
# make plot
ax = sns.scatterplot(subset, x = 'dv', y = 'proportion')
ax.plot(fit_dv, fit_p, 'r')
ax.set(xlabel='dv', ylabel='proportion')
# calculate image values
img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
p_slope = fitc.pvalues.iloc[0]
In [10]:
# function fit_model
def fit_model(tbl):
# exclude no oberservation
tbl = tbl.loc[tbl['Ntot'] != 0, :]
# prepare for regression
endog = [tbl['Nimg'], tbl['Ntot'] - tbl['Nimg']]
endog = pd.concat(endog, axis=1)
endog.columns = ['Nimg', 'Nnot']
exog = sm.add_constant(tbl['dv'], prepend = False)
try:
# sm.GLM can produce a lot of warning when this function is applied to every entry
# The with ... block will catch and ignore these warnings
with warnings.catch_warnings():
warnings.simplefilter('ignore')
# fit logistic model by GLM
fitc = sm.GLM(endog, exog, family = sm.families.Binomial(), missing = 'drop').fit()
# calculate image value and p value
img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
p_slope = fitc.pvalues.iloc[0]
except:
# assign value > 1000
img_val = 2000
p_slope = 2000
tbl_out = pd.Series([img_val, p_slope], index = ['img_value', 'p_slope'])
return tbl_out
# observed=True saves you memory when you have unobserved categorical values.
# For example, assume the piccat categories go from 0 to 6, but your dataset only has 1, 3, 5.
# In this case, observed=True will return a DataFrame with only 1, 3, 5,
# but observed=False will return a DF with 0 to 9, and under 0, 2, 4, 6, everything is zero or NaN.
results = dat.groupby(['session', 'piccat'], observed = True).apply(fit_model)
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide img_val = fitc.params.iloc[1] / fitc.params.iloc[0] /tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide img_val = fitc.params.iloc[1] / fitc.params.iloc[0] /tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide img_val = fitc.params.iloc[1] / fitc.params.iloc[0] /tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
In [11]:
# clean results and produce boxplot
# clean results
results = results.loc[np.abs(results['img_value']) <= 1000]
results.reset_index(level = 1, inplace = True)
# generate box plot
sns.boxplot(x = 'piccat', y = 'img_value', data = results[['piccat', 'img_value']],
flierprops = {"marker": 'x'})
Out[11]:
<Axes: xlabel='piccat', ylabel='img_value'>