# Worked exam examples: descriptive statistics, group comparisons,
# correlation and regression on the bank term-deposit data set.
#
# The script is written to be stepped through interactively: bare
# expressions such as ``df['balance'].mean()`` only display their value in
# a REPL / notebook / IDE console, not when the file is run as a program.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.api as sms
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from statsmodels.graphics.gofplots import ProbPlot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import statsmodels.formula.api as smf


def cohen_d(x, y):
    """Return Cohen's d effect size for two independent samples.

    The difference of the sample means is divided by the pooled standard
    deviation, where each sample variance uses ``ddof=1`` and is weighted
    by its degrees of freedom.

    Args:
        x: First sample (array-like of numbers).
        y: Second sample (array-like of numbers).

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd``. Positive when ``x`` has the
        larger mean.
    """
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    pooled_var = (
        (nx - 1) * np.std(x, ddof=1) ** 2
        + (ny - 1) * np.std(y, ddof=1) ** 2
    ) / dof
    return (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)


df = pd.read_csv('Exam\\Data_Bank_Term_2.csv')

# ---------------------------------------------------------------------------
# Question 1: Mean, Median, SD
# ---------------------------------------------------------------------------
df['balance'].mean()
df['balance'].median()
df['balance'].std()

# ---------------------------------------------------------------------------
# Question 2: Is there a significant effect of IV on DV?
# (duration ~ y in this example)
# NOTE(review): the original note said "duration ~ loan", but every
# statement below groups by the `y` column - confirm which was intended.
# ---------------------------------------------------------------------------
# 1st step: split the DV (first selector) by the levels of the IV (mask).
sample1 = df['duration'][df['y'] == 'yes']
sample2 = df['duration'][df['y'] == 'no']

# Check normality (Shapiro-Wilk).
stats.shapiro(df['duration'])

# Try data transformations (log, square root) and re-check normality.
stats.shapiro(np.log(df['duration']))
stats.shapiro(np.sqrt(df['duration']))

# Check homogeneity of variance (Levene).
stats.levene(sample1, sample2)

# Fit the linear model to read off the degrees of freedom for reporting.
model = ols('duration ~ y', data=df).fit()
model.summary()

# Non-parametric alternative: Wilcoxon rank-sum test.
# `stats.wilcoxon` is the *paired* signed-rank test and requires both
# samples to have the same length, so the rank-sum variant is used here.
stats.ranksums(sample1, sample2)

# Mann-Whitney U also works for samples of different length.
stats.mannwhitneyu(sample1, sample2)

# Effect size: Cohen's d.
cohen_d(sample1, sample2)

# ---------------------------------------------------------------------------
# Question 3: Draw CI plot for previous question
# ---------------------------------------------------------------------------
# Each interval is computed once and drawn as a vertical bar.
ci_low1, ci_high1 = sms.DescrStatsW(sample1).tconfint_mean()
ci_low2, ci_high2 = sms.DescrStatsW(sample2).tconfint_mean()
plt.vlines(x=0.25, ymin=ci_low1, ymax=ci_high1, colors='blue')
plt.vlines(x=0.75, ymin=ci_low2, ymax=ci_high2, colors='red')
plt.xlim(0, 1)
plt.xticks(ticks=[], labels='')
plt.show()

# ---------------------------------------------------------------------------
# Question 4: Is there a significant effect of IV on DV? (age ~ marital)
# If so, in which group?
# ---------------------------------------------------------------------------
# 1st step: the IV (marital) is split into three levels here.
mar1 = df['age'][df.marital == 'single']
mar2 = df['age'][df.marital == 'divorced']
mar3 = df['age'][df.marital == 'married']

# Check normality (Shapiro-Wilk).
stats.shapiro(df['age'])

# Try data transformations (log, square root) and re-check normality.
stats.shapiro(np.log(df['age']))
stats.shapiro(np.sqrt(df['age']))

# Check homogeneity of variance (Levene).
stats.levene(mar1, mar2, mar3)

# Fit the linear model to read off the degrees of freedom for reporting.
model = ols('age ~ marital', data=df).fit()
model.summary()

# Kruskal-Wallis: non-parametric alternative when the parametric
# assumptions are violated.
stats.kruskal(mar1, mar2, mar3)

# Tukey HSD shows which pairs of groups differ.
thsd = pairwise_tukeyhsd(df['age'], df['marital'])
print(thsd.summary())

# Alternative post-hoc: Dunn test on a two-column frame (group, value).
# The original selected the placeholder columns 'IV' and 'DV'; for this
# question the grouping column is `marital` and the value column is `age`.
df2 = df[['marital', 'age']].rename(
    columns={'marital': 'groups', 'age': 'values'}
)
test1 = sp.posthoc_dunn(df2, val_col='values', group_col='groups')
test1

# Inspect the result row by row.
test1.iloc[0]
test1.iloc[1]

# ---------------------------------------------------------------------------
# Question 5: Is there a correlation between two variables?
# (balance and duration in this case)
# NOTE(review): the original note said "age and duration", but the code
# below uses `balance` and `duration` - confirm which was intended.
# ---------------------------------------------------------------------------
# Check normality of each variable.
stats.shapiro(df['balance'])
stats.shapiro(df['duration'])

# Pearson's r if there is no violation.
stats.pearsonr(df['balance'], df['duration'])

# Spearman's rho otherwise.
stats.spearmanr(df['balance'], df['duration'])

# ---------------------------------------------------------------------------
# Question 6: Draw regression plot with balance on the Y-axis
# NOTE(review): the original note mentioned "balance and duration", but
# the plotted x variable is `age` - confirm which was intended.
# ---------------------------------------------------------------------------
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally.
sns.lmplot(data=df, x='age', y='balance', line_kws={'color': 'red'})
plt.show()

# ---------------------------------------------------------------------------
# Question 7: Multiple linear regression (balance ~ age + education)
# ---------------------------------------------------------------------------
model = ols('balance ~ age + education', data=df).fit()
model.summary()

# ---------------------------------------------------------------------------
# Question 8: Diagnostic plot for the linear model (normal Q-Q)
# ---------------------------------------------------------------------------
# Internally studentized residuals of the Question 7 model.
model_norm_residuals = model.get_influence().resid_studentized_internal
QQ = ProbPlot(model_norm_residuals)
QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized Residuals')
plt.show()
# Direct link: https://paste.plurk.com/show/0YqcEXDLNwIKch6jvx4l