Chi-Square Test¶
The chi-square test of independence is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables.
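As a quick illustration before the full example below, `scipy.stats.chi2_contingency` takes a table of observed counts and returns the test statistic, p-value, degrees of freedom, and expected frequencies under independence (the counts here are made up):

```python
import numpy as np
from scipy.stats import chi2_contingency

# Hypothetical 2x2 contingency table of observed counts:
# rows = gender, columns = smoker yes/no (made-up numbers)
observed = np.array([[20, 30],
                     [25, 25]])

# Unpacks the same way in all scipy versions
chi2_stat, p, dof, expected = chi2_contingency(observed)
print(chi2_stat, p, dof)
```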
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import numpy as np
dataset=sns.load_dataset('tips')
dataset.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
dataset_table=pd.crosstab(dataset['sex'],dataset['smoker'])
print(dataset_table)
smoker  Yes  No
sex            
Male     60  97
Female   33  54
dataset_table.values
array([[60, 97],
[33, 54]], dtype=int64)
#Observed Values
Observed_Values = dataset_table.values
print("Observed Values :-\n",Observed_Values)
Observed Values :-
 [[60 97]
 [33 54]]
val=stats.chi2_contingency(dataset_table)
val
Chi2ContingencyResult(statistic=0.0, pvalue=1.0, dof=1, expected_freq=array([[59.84016393, 97.15983607],
[33.15983607, 53.84016393]]))
Expected_Values=val.expected_freq   # equivalent to val[3]
no_of_rows,no_of_columns=dataset_table.shape
ddof=(no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:-",ddof)
alpha = 0.05
Degree of Freedom:- 1
from scipy.stats import chi2
chi_square=sum([(o-e)**2./e for o,e in zip(Observed_Values,Expected_Values)])
chi_square_statistic=chi_square[0]+chi_square[1]
print("chi-square statistic:-",chi_square_statistic)
chi-square statistic:- 0.001934818536627623
critical_value=chi2.ppf(q=1-alpha,df=ddof)
print('critical_value:',critical_value)
critical_value: 3.841458820694124
#p-value
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)
print('p-value:',p_value)
print('Significance level:',alpha)
print('Degree of Freedom:',ddof)
p-value: 0.964915107315732
Significance level: 0.05
Degree of Freedom: 1
if chi_square_statistic>=critical_value:
    print("Reject H0: there is a relationship between the 2 categorical variables")
else:
    print("Retain H0: there is no relationship between the 2 categorical variables")
if p_value<=alpha:
    print("Reject H0: there is a relationship between the 2 categorical variables")
else:
    print("Retain H0: there is no relationship between the 2 categorical variables")
Retain H0: there is no relationship between the 2 categorical variables
Retain H0: there is no relationship between the 2 categorical variables
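As a side note (a sketch, not part of the original analysis): `chi2_contingency` applies Yates' continuity correction to 2x2 tables by default, which is why the result above shows statistic=0.0 while the hand-computed Pearson statistic is about 0.0019. Disabling the correction reproduces the manual value:

```python
import numpy as np
from scipy.stats import chi2_contingency

observed = np.array([[60, 97],
                     [33, 54]])   # the sex x smoker table from above

# correction=False gives the plain Pearson statistic sum((O-E)^2 / E),
# i.e. exactly what the manual computation produces
chi2_stat, p, dof, expected = chi2_contingency(observed, correction=False)
print(chi2_stat, p)
```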
T-Test¶
A t-test is a type of inferential statistic used to determine whether there is a significant difference between the means of two groups, which may be related in certain features.
A t-test has two types:
1. one-sample t-test
2. two-sample t-test
One-sample T-test with Python¶
The test tells us whether the mean of the sample and the mean of the population are different.
ages=[10,20,35,50,28,40,55,18,16,55,30,25,43,18,30,28,14,24,16,17,32,35,26,27,65,18,43,23,21,20,19,70]
len(ages)
32
import numpy as np
ages_mean=np.mean(ages)
print(ages_mean)
30.34375
## Lets take sample
sample_size=10
age_sample=np.random.choice(ages,sample_size)
age_sample
array([23, 18, 10, 19, 20, 32, 16, 24, 10, 35])
from scipy.stats import ttest_1samp
ttest,p_value=ttest_1samp(age_sample,30)
print(p_value)
0.006006021202253093
if p_value < 0.05:    # alpha value is 0.05 or 5%
    print("we are rejecting the null hypothesis")
else:
    print("we fail to reject the null hypothesis")
we are rejecting the null hypothesis
Some More Examples¶
Consider the ages of students in a college and in Class A.
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
np.random.seed(6)
school_ages=stats.poisson.rvs(loc=18,mu=35,size=1500)
classA_ages=stats.poisson.rvs(loc=18,mu=30,size=60)
classA_ages.mean()
46.9
_,p_value=stats.ttest_1samp(a=classA_ages,popmean=school_ages.mean())
p_value
1.139027071016194e-13
school_ages.mean()
53.303333333333335
if p_value < 0.05:    # alpha value is 0.05 or 5%
    print("we are rejecting the null hypothesis")
else:
    print("we fail to reject the null hypothesis")
we are rejecting the null hypothesis
Two-sample T-test With Python¶
The independent-samples t-test (or 2-sample t-test) compares the means of two independent groups to determine whether there is statistical evidence that the associated population means are significantly different. It is a parametric test, also known as the independent t-test.
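The notebook does not include code for this case, so here is a minimal sketch using `scipy.stats.ttest_ind` (the two groups below are synthetic, not from the tips or iris data):

```python
import numpy as np
from scipy.stats import ttest_ind

np.random.seed(12)
# Two hypothetical independent groups of ages (made-up distributions)
classA_ages = np.random.normal(loc=46, scale=4, size=60)
classB_ages = np.random.normal(loc=50, scale=4, size=60)

t_stat, p_value = ttest_ind(classA_ages, classB_ages)
print("t-statistic:", t_stat)
print("p-value:", p_value)

if p_value < 0.05:
    print("we are rejecting the null hypothesis")
else:
    print("we fail to reject the null hypothesis")
```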
Paired T-test With Python¶
When you want to check how different two samples from the same group are (e.g. measurements before and after a treatment), you can use a paired t-test.
weight1=[25,30,28,35,28,34,26,29,30,26,28,32,31,30,45]
weight2=weight1+stats.norm.rvs(scale=5,loc=-1.25,size=15)
print(weight1)
print(weight2)
[25, 30, 28, 35, 28, 34, 26, 29, 30, 26, 28, 32, 31, 30, 45]
[23.97614363 26.85249356 24.1373696  27.16615209 32.74902935 37.51232196
 18.81017209 33.31296285 30.20472761 25.47138779 21.60004812 34.32284878
 36.99043308 30.7551601  40.64860465]
weight_df=pd.DataFrame({"weight_10":np.array(weight1),
"weight_20":np.array(weight2),
"weight_change":np.array(weight2)-np.array(weight1)})
weight_df
| weight_10 | weight_20 | weight_change | |
|---|---|---|---|
| 0 | 25 | 23.976144 | -1.023856 |
| 1 | 30 | 26.852494 | -3.147506 |
| 2 | 28 | 24.137370 | -3.862630 |
| 3 | 35 | 27.166152 | -7.833848 |
| 4 | 28 | 32.749029 | 4.749029 |
| 5 | 34 | 37.512322 | 3.512322 |
| 6 | 26 | 18.810172 | -7.189828 |
| 7 | 29 | 33.312963 | 4.312963 |
| 8 | 30 | 30.204728 | 0.204728 |
| 9 | 26 | 25.471388 | -0.528612 |
| 10 | 28 | 21.600048 | -6.399952 |
| 11 | 32 | 34.322849 | 2.322849 |
| 12 | 31 | 36.990433 | 5.990433 |
| 13 | 30 | 30.755160 | 0.755160 |
| 14 | 45 | 40.648605 | -4.351395 |
_,p_value=stats.ttest_rel(a=weight1,b=weight2)
print(p_value)
0.4858219692122552
if p_value < 0.05:    # alpha value is 0.05 or 5%
    print("we are rejecting the null hypothesis")
else:
    print("we fail to reject the null hypothesis")
we fail to reject the null hypothesis
Correlation¶
import seaborn as sns
df=sns.load_dataset('iris')
df.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
df.shape
(150, 5)
df.drop(["species"],axis=1).corr()
| sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|
| sepal_length | 1.000000 | -0.117570 | 0.871754 | 0.817941 |
| sepal_width | -0.117570 | 1.000000 | -0.428440 | -0.366126 |
| petal_length | 0.871754 | -0.428440 | 1.000000 | 0.962865 |
| petal_width | 0.817941 | -0.366126 | 0.962865 | 1.000000 |
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x23f97b7db80>
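`DataFrame.corr()` reports correlation coefficients but no significance test. To get a p-value for a single pair of variables, `scipy.stats.pearsonr` can be used; the arrays below are made-up paired measurements, not the iris columns:

```python
import numpy as np
from scipy.stats import pearsonr

# Hypothetical paired measurements (made-up values)
x = np.array([1.4, 1.5, 1.3, 4.5, 4.1, 5.0, 5.9, 5.6])
y = np.array([0.2, 0.2, 0.2, 1.5, 1.3, 1.7, 2.1, 2.2])

# Returns the correlation coefficient and a two-sided p-value
r, p = pearsonr(x, y)
print("Pearson r:", r)
print("p-value:", p)
```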
ANOVA Test (F-Test)¶
The t-test works well when dealing with two groups, but sometimes we want to compare more than two groups at the same time.
For example, if we wanted to test whether petal_width differs based on some categorical variable like species, we would have to compare the means of each level (group) of the variable.
One-Way F-test (ANOVA)¶
It tells whether two or more groups are similar or not, based on the similarity of their means and the F-score.
Example: there are 3 different categories of iris flowers with their petal widths, and we need to check whether all 3 groups are similar or not.
import seaborn as sns
df1=sns.load_dataset('iris')
df1.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
df_anova = df1[['petal_width','species']]
grps = pd.unique(df_anova.species.values)
grps
array(['setosa', 'versicolor', 'virginica'], dtype=object)
d_data = {grp:df_anova['petal_width'][df_anova.species == grp] for grp in grps}
d_data
{'setosa': 0 0.2
1 0.2
2 0.2
3 0.2
4 0.2
5 0.4
6 0.3
7 0.2
8 0.2
9 0.1
10 0.2
11 0.2
12 0.1
13 0.1
14 0.2
15 0.4
16 0.4
17 0.3
18 0.3
19 0.3
20 0.2
21 0.4
22 0.2
23 0.5
24 0.2
25 0.2
26 0.4
27 0.2
28 0.2
29 0.2
30 0.2
31 0.4
32 0.1
33 0.2
34 0.2
35 0.2
36 0.2
37 0.1
38 0.2
39 0.2
40 0.3
41 0.3
42 0.2
43 0.6
44 0.4
45 0.3
46 0.2
47 0.2
48 0.2
49 0.2
Name: petal_width, dtype: float64,
'versicolor': 50 1.4
51 1.5
52 1.5
53 1.3
54 1.5
55 1.3
56 1.6
57 1.0
58 1.3
59 1.4
60 1.0
61 1.5
62 1.0
63 1.4
64 1.3
65 1.4
66 1.5
67 1.0
68 1.5
69 1.1
70 1.8
71 1.3
72 1.5
73 1.2
74 1.3
75 1.4
76 1.4
77 1.7
78 1.5
79 1.0
80 1.1
81 1.0
82 1.2
83 1.6
84 1.5
85 1.6
86 1.5
87 1.3
88 1.3
89 1.3
90 1.2
91 1.4
92 1.2
93 1.0
94 1.3
95 1.2
96 1.3
97 1.3
98 1.1
99 1.3
Name: petal_width, dtype: float64,
'virginica': 100 2.5
101 1.9
102 2.1
103 1.8
104 2.2
105 2.1
106 1.7
107 1.8
108 1.8
109 2.5
110 2.0
111 1.9
112 2.1
113 2.0
114 2.4
115 2.3
116 1.8
117 2.2
118 2.3
119 1.5
120 2.3
121 2.0
122 2.0
123 1.8
124 2.1
125 1.8
126 1.8
127 1.8
128 2.1
129 1.6
130 1.9
131 2.0
132 2.2
133 1.5
134 1.4
135 2.3
136 2.4
137 1.8
138 1.8
139 2.1
140 2.4
141 2.3
142 1.9
143 2.3
144 2.5
145 2.3
146 1.9
147 2.0
148 2.3
149 1.8
Name: petal_width, dtype: float64}
F, p = stats.f_oneway(d_data['setosa'], d_data['versicolor'], d_data['virginica'])
print(p)
4.169445839443116e-85
if p<0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")
reject null hypothesis