1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.api as sms
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from statsmodels.graphics.gofplots import ProbPlot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import statsmodels.formula.api as smf

# Load the exam data set. A forward slash is used as the path separator
# because it is accepted on Windows, Linux and macOS alike, whereas a
# backslash separator only resolves on Windows.
df = pd.read_csv('Exam/Data_Bank_Term_2.csv')

# Question 1: Mean, Median, SD of the 'balance' column
# (pandas' Series.std() is the sample standard deviation, ddof=1, by default)
df['balance'].mean()
df['balance'].median()
df['balance'].std()

# Question 2: Is there a significant effect of IV on DV? (duration ~ y in this example)
# Step 1: split the DV (duration) by the two levels of the IV (y)
sample1 = df['duration'][df['y'] == 'yes']
sample2 = df['duration'][df['y'] == 'no']
# Check normality of the DV
stats.shapiro(df['duration'])
# Re-check normality after log and square-root transformations
stats.shapiro(np.log(df['duration']))
stats.shapiro(np.sqrt(df['duration']))
# Check homogeneity of variance between the two groups
stats.levene(sample1, sample2)
# Fit the linear model to read off the degrees of freedom for reporting
model = ols('duration ~ y', data=df).fit()
model.summary()
# Non-parametric alternative: Wilcoxon rank-sum test for two independent samples.
# stats.wilcoxon is the *paired* signed-rank test and raises a ValueError when
# the two samples differ in length, so stats.ranksums is the correct call here.
stats.ranksums(sample1, sample2)
# Mann-Whitney U also handles independent samples of unequal length
stats.mannwhitneyu(sample1, sample2)
# Calculate the effect size (Cohen's d)
def cohen_d(x, y):
    """Return Cohen's d for two samples using the pooled standard deviation.

    The result is the difference of the sample means (``x`` minus ``y``)
    divided by the square root of the pooled variance, where each sample
    variance is computed with ``ddof=1`` and weighted by ``len(sample) - 1``.

    Parameters
    ----------
    x, y : sequences accepted by ``len``, ``np.mean`` and ``np.std``
        The two samples to compare; they may differ in length.

    Returns
    -------
    The standardized mean difference; positive when ``x`` has the larger mean.
    """
    size_x, size_y = len(x), len(y)
    # Sample variances, obtained as squared sample standard deviations
    var_x = np.std(x, ddof=1) ** 2
    var_y = np.std(y, ddof=1) ** 2
    # Pool the variances, weighting each by its own degrees of freedom
    pooled_var = ((size_x - 1) * var_x + (size_y - 1) * var_y) / (size_x + size_y - 2)
    mean_gap = np.mean(x) - np.mean(y)
    return mean_gap / np.sqrt(pooled_var)
# Effect size for the two groups compared in Question 2
cohen_d(sample1, sample2)

# Question 3: Draw CI plot for previous question
# Each group gets one vertical bar spanning the t-based confidence interval
# of its mean (lower bound to upper bound).
ci1_low, ci1_high = sms.DescrStatsW(sample1).tconfint_mean()
ci2_low, ci2_high = sms.DescrStatsW(sample2).tconfint_mean()
plt.vlines(x=0.25, ymin=ci1_low, ymax=ci1_high, colors='blue')
plt.vlines(x=0.75, ymin=ci2_low, ymax=ci2_high, colors='red')
# Fix the x-range so both bars sit inside the frame, and drop the x ticks
# because the x positions carry no numeric meaning.
plt.xlim(0, 1)
plt.xticks(ticks=[], labels='')
plt.show()

# Question 4: Is there a significant effect of IV on DV? (age ~ marital)
# If so, between which groups?
# Step 1: the IV (marital) has more than 2 levels, so split the DV (age) per level
mar1, mar2, mar3 = (
    df.loc[df['marital'] == status, 'age']
    for status in ('single', 'divorced', 'married')
)
# Normality of the DV: raw, then after log and square-root transformations
stats.shapiro(df['age'])
stats.shapiro(np.log(df['age']))
stats.shapiro(np.sqrt(df['age']))
# Homogeneity of variance across the three groups
stats.levene(mar1, mar2, mar3)
# Fit the linear model to read off the degrees of freedom for reporting
model = ols('age ~ marital', data=df).fit()
model.summary()
# Kruskal-Wallis: non-parametric alternative when a parametric assumption is violated
stats.kruskal(mar1, mar2, mar3)
# Tukey HSD tells which pairs of groups differ; print its summary table
thsd = pairwise_tukeyhsd(endog=df['age'], groups=df['marital'])
print(thsd.summary())

# Alternative post-hoc: Dunn's test for the Question 4 comparison (age ~ marital).
# The grouping column (marital) and the value column (age) are selected
# explicitly; the placeholder names 'IV' / 'DV' are not columns used anywhere
# else in this script and would raise a KeyError.
df2 = df[['marital', 'age']].copy()
df2.columns = ['groups', 'values']
# Run the test once and keep the resulting table of pairwise comparisons
test1 = sp.posthoc_dunn(df2, val_col='values', group_col='groups')
test1
# See it row by row
test1.iloc[0]
test1.iloc[1]

# Question 5: Is there a correlation between two variables? (balance and duration in this case)
corr_x = df['balance']
corr_y = df['duration']
# Check normality of each variable before choosing the coefficient
stats.shapiro(corr_x)
stats.shapiro(corr_y)
# Pearson's r: use when normality is not violated
stats.pearsonr(corr_x, corr_y)
# Spearman's rho: rank-based alternative
stats.spearmanr(corr_x, corr_y)

# Question 6: Draw regression plot with balance on the Y-axis and age on the X-axis
# x and y are passed by keyword: recent seaborn releases accept them only as
# keyword arguments, and a positional first argument is interpreted as `data`.
sns.lmplot(x='age', y='balance', data=df, line_kws={'color': 'red'})
plt.show()

# Question 7: Multiple linear regression of balance on age and education
regression_formula = 'balance ~ age + education'
model = ols(regression_formula, data=df).fit()
# Coefficient table and fit statistics for the fitted model
model.summary()

# Question 8: Diagnostic plot for the linear model (normal Q-Q plot)
# Internally studentized residuals of the most recently fitted `model`
influence = model.get_influence()
model_norm_residuals = influence.resid_studentized_internal
# Plot the residual quantiles against theoretical quantiles with a 45-degree
# reference line.
QQ = ProbPlot(model_norm_residuals)
qq_marker_style = {'alpha': 0.5, 'color': '#4C72B0', 'lw': 1}
QQ.qqplot(line='45', **qq_marker_style)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized Residuals')
plt.show()