import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# load data from a csv into a table
dat = pd.read_csv('ppv_data.csv') # set your own directory
dat.head()

# transfer columns into categorical
cat_vars = ['monk', 'session', 'piccat']
for col in cat_vars:
    dat[col] = dat[col].astype('category')
print(dat.dtypes)

monk        category
session     category
piccat      category
dv             int64
Nimg           int64
Ntot           int64
sessdate      object
dtype: object

# subsetting the data
# Extract a subset of the data to use in developing the analysis. 
# For instance, session 9, picture category 4.
subset = dat.loc[(dat['session'] == 9) & (dat['piccat'] == 4), :]
subset.head()

# Remove any rows in the data subset that correspond to no observations (Ntot = 0).
subset = subset.loc[subset['Ntot'] != 0, :]
subset.head()

# Examining the data
# For each value difference in your data subset, 
# calculate the proportion of trials on which the animal chose the juice-plus-image option.
subset['proportion'] = subset['Nimg'] / subset['Ntot']
subset.head()

# Plot this proportion as a set of points.
sns.scatterplot(subset, x = 'dv', y = 'proportion')

<Axes: xlabel='dv', ylabel='proportion'>

# fit GLM
endog = [subset['Nimg'], subset['Ntot'] - subset['Nimg']]
endog = pd.concat(endog, axis=1)
endog.columns = ['Nimg', 'Nnot']
exog = sm.add_constant(subset['dv'], prepend = False)

fitc = sm.GLM(endog, exog, family = sm.families.Binomial()).fit()
fitc.summary()

# check the fit
# new dv values
fit_dv = np.linspace(-150, 150, num = 301)

# generate predictions
# get_prediction() returns an object contains many informations,
# and you need to obtain the predicted values with predicted_mean
# fit_p = fitc.get_prediction(sm.add_constant(fit_dv, prepend = False)).predicted_mean
# predict() only returns the predicted values
fit_p = fitc.predict(sm.add_constant(fit_dv, prepend = False))

# make plot
ax = sns.scatterplot(subset, x = 'dv', y = 'proportion')
ax.plot(fit_dv, fit_p, 'r')
ax.set(xlabel='dv', ylabel='proportion')

# calculate image values
img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
p_slope = fitc.pvalues.iloc[0]

# function fit_model
def fit_model(tbl):
    # exclude no oberservation
    tbl = tbl.loc[tbl['Ntot'] != 0, :]
    
    # prepare for regression
    endog = [tbl['Nimg'], tbl['Ntot'] - tbl['Nimg']]
    endog = pd.concat(endog, axis=1)
    endog.columns = ['Nimg', 'Nnot']
    exog = sm.add_constant(tbl['dv'], prepend = False)

    try:
        # sm.GLM can produce a lot of warning when this function is applied to every entry
        # The with ... block will catch and ignore these warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            # fit logistic model by GLM
            fitc = sm.GLM(endog, exog, family = sm.families.Binomial(), missing = 'drop').fit()
    
        # calculate image value and p value
        img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
        p_slope = fitc.pvalues.iloc[0]
    except:
        # assign value > 1000
        img_val = 2000
        p_slope = 2000
        
    tbl_out = pd.Series([img_val, p_slope], index = ['img_value', 'p_slope'])
    
    return tbl_out

# observed=True saves you memory when you have unobserved categorical values.
# For example, assume the piccat categories go from 0 to 6, but your dataset only has 1, 3, 5.
# In this case, observed=True will return a DataFrame with only 1, 3, 5,
# but observed=False will return a DF with 0 to 9, and under 0, 2, 4, 6, everything is zero or NaN.
results = dat.groupby(['session', 'piccat'], observed = True).apply(fit_model)

/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]
/tmp/ipykernel_20539/2311991723.py:21: RuntimeWarning: divide by zero encountered in scalar divide
  img_val = fitc.params.iloc[1] / fitc.params.iloc[0]

# clean results and produce boxplot
# clean results
results = results.loc[np.abs(results['img_value']) <= 1000]
results.reset_index(level = 1, inplace = True)

# generate box plot
sns.boxplot(x = 'piccat', y = 'img_value', data = results[['piccat', 'img_value']],
            flierprops = {"marker": 'x'})

<Axes: xlabel='piccat', ylabel='img_value'>

	monk	session	piccat	dv	Nimg	Ntot	sessdate
0	1	1	1	-20	0	24	2008-03-15
1	1	1	2	-20	13	24	2008-03-15
2	1	1	3	-20	2	24	2008-03-15
3	1	1	4	-20	11	24	2008-03-15
4	1	1	1	-10	10	24	2008-03-15

	monk	session	piccat	dv	Nimg	Ntot	sessdate
175	1	9	4	-120	2	21	2008-03-28
179	1	9	4	-100	0	0	2008-03-28
183	1	9	4	-40	7	30	2008-03-28
187	1	9	4	0	15	32	2008-03-28
191	1	9	4	40	25	30	2008-03-28

	monk	session	piccat	dv	Nimg	Ntot	sessdate
175	1	9	4	-120	2	21	2008-03-28
183	1	9	4	-40	7	30	2008-03-28
187	1	9	4	0	15	32	2008-03-28
191	1	9	4	40	25	30	2008-03-28
199	1	9	4	120	18	20	2008-03-28

	monk	session	piccat	dv	Nimg	Ntot	sessdate	proportion
175	1	9	4	-120	2	21	2008-03-28	0.095238
183	1	9	4	-40	7	30	2008-03-28	0.233333
187	1	9	4	0	15	32	2008-03-28	0.468750
191	1	9	4	40	25	30	2008-03-28	0.833333
199	1	9	4	120	18	20	2008-03-28	0.900000

Dep. Variable:	['Nimg', 'Nnot']	No. Observations:	5
Model:	GLM	Df Residuals:	3
Model Family:	Binomial	Df Model:	1
Link Function:	Logit	Scale:	1.0000
Method:	IRLS	Log-Likelihood:	-9.7379
Date:	Sun, 18 Feb 2024	Deviance:	3.6799
Time:	09:09:39	Pearson chi2:	3.69
No. Iterations:	5	Pseudo R-squ. (CS):	1.000
Covariance Type:	nonrobust

	coef	std err	z	P>\|z\|	[0.025	0.975]
dv	0.0233	0.004	5.288	0.000	0.015	0.032
const	0.0421	0.212	0.199	0.842	-0.372	0.457