Source code for unipy.stats.hypo_test

"""Statistical Hypothesis Tests.

"""


import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.stats.api as smt
import statsmodels.formula.api as smf


__all__ = ['f_test',
           'f_test_formula',
           'anova_test',
           'anova_test_formula',
           'chisq_test',
           'fisher_test']


[docs]def f_test(a, b, scale=1, alternative='two-sided', conf_level=.95, *args, **kwargs): """F-Test. """ assert 0 < scale <= 1 assert 0 < conf_level <= 1 dfn = len(a) - 1 dfd = len(b) - 1 if (len(a) or len(b)) < 2: print('Observations are insufficient.') f_statistics = np.nan p_value = np.nan conf_min, conf_max = np.nan, np.nan else: try: f_statistics = np.array(a).var() / np.array(b).var() except ZeroDivisionError: f_statistics = np.nan p_value = st.f.cdf(np.abs(f_statistics), dfn, dfd, scale=scale) # p_value = distributions.t.sf(np.abs(t), df) conf_interval = st.f.interval(conf_level, dfn, dfd, scale=scale) conf_min, conf_max = np.multiply(f_statistics, conf_interval) if alternative == 'two-sided': p_value = 2 * np.min([p_value, 1-p_value]) elif alternative == 'greater': p_value = st.f.sf(f_statistics, dfn, dfd, scale=scale) conf_min, conf_max = conf_min, np.inf elif alternative == 'less': conf_min, conf_max = 0, conf_max # print('F-Statistics: %.4g' % f_statistics) # print('dfn, dfd: %d %d' % (dfn, dfd) ) # print('Hypothesized Scale: %d' % scale) # print('Confidence Level: %.3g' % conf_level) # print('Confidence Interval: %.4g, %.4g' % (conf_min, conf_max)) # print('P-value: %.4g' % p_value) res = pd.Series({'f_statistics': f_statistics, 'dfn': dfn, 'dfd': dfd, 'hypo_scale': scale, 'conf_level': conf_level, 'conf_interval': (conf_min, conf_max), 'p_value': p_value}) return res
[docs]def f_test_formula(a, b, scale=1, alternative='two-sided', conf_level=.95, *args, **kwargs): """F-Test by formula. """ assert 0 < scale <= 1 assert 0 < conf_level <= 1 dfn = len(a) - 1 dfd = len(b) - 1 if (len(a) or len(b)) < 2: print('Observations are insufficient.') f_statistics = np.nan p_value = np.nan conf_min, conf_max = np.nan, np.nan else: try: f_statistics = np.array(a).var() / np.array(b).var() except ZeroDivisionError: f_statistics = np.nan p_value = st.f.cdf(np.abs(f_statistics), dfn, dfd, scale=scale) # p_value = distributions.t.sf(np.abs(t), df) conf_interval = st.f.interval(conf_level, dfn, dfd, scale=scale) conf_min, conf_max = np.multiply(f_statistics, conf_interval) if alternative == 'two-sided': p_value = 2 * np.min([p_value, 1-p_value]) elif alternative == 'greater': p_value = st.f.sf(f_statistics, dfn, dfd, scale=scale) conf_min, conf_max = conf_min, np.inf elif alternative == 'less': conf_min, conf_max = 0, conf_max # print('F-Statistics: %.4g' % f_statistics) # print('dfn, dfd: %d %d' % (dfn, dfd) ) # print('Hypothesized Scale: %d' % scale) # print('Confidence Level: %.3g' % conf_level) # print('Confidence Interval: %.4g, %.4g' % (conf_min, conf_max)) # print('P-value: %.4g' % p_value) res = pd.Series({'f_statistics': f_statistics, 'dfn': dfn, 'dfd': dfd, 'hypo_scale': scale, 'conf_level': conf_level, 'conf_interval': (conf_min, conf_max), 'p_value': p_value}) return res
[docs]def anova_test(formula, data=None, typ=1): """ANOVA Test. """ lin_model = smf.ols(formula, data=data).fit() res_anova = smt.anova_lm(lin_model, typ=typ) return res_anova, lin_model
[docs]def anova_test_formula(formula, data=None, typ=1): """ANOVA Test by formula. """ lin_model = smf.ols(formula, data=data).fit() res_anova = smt.anova_lm(lin_model, typ=typ) return res_anova, lin_model
[docs]def chisq_test(data, x=None, y=None, correction=None, lambda_=None, margin=True, print_ok=True): """Chi-square Test. ``lambda_`` gives the power in the Cressie-Read power divergence statistic. The default is 1. For convenience, lambda_ may be assigned one of the following strings, in which case the corresponding numerical value is used: Parameters ---------- data: pandas.DataFrame x: str (default: None) y: str (default: None) correction: (default: None) lambda_: lambda (default: None) margin: Boolean (default: True) print_ok: Boolean (default: True) """ dataChi = data[[x, y]].dropna() crossT = pd.crosstab(dataChi[x], dataChi[y], margins=margin) if margin: crossT_raw = crossT.iloc[:-1, :-1] (chiSqr, pValue, freeDeg, expectedV_raw) = st.chi2_contingency(crossT_raw, correction=correction, lambda_=lambda_) expectedV_raw = pd.DataFrame(expectedV_raw, index=crossT_raw.index, columns=crossT_raw.columns) expectedV = expectedV_raw.copy() expectedV['All'] = expectedV.sum(axis=1) expectedV = expectedV.append(expectedV.sum(axis=0).rename('All')) else: (chiSqr, pValue, freeDeg, expectedV_raw) = st.chi2_contingency(crossT, correction=correction, lambda_=lambda_) expectedV_raw = pd.DataFrame(expectedV_raw, index=crossT.index, columns=crossT.columns) expectedV = expectedV_raw check_chisq_rule = (expectedV_raw[expectedV_raw <= 5].count().sum() <= (expectedV_raw.shape[0] * expectedV_raw.shape[1]) * .2) if print_ok: print('#'*20 + ' Chi-Square Test ' + '#'*20 + '\n') print('x : ' + x) print('y : ' + y) print('\n## Contingency Table') print('-'*50) print(crossT) print('-'*50) msg = """ ## Chi-square Statistic : {} ## P-value : {} ## Degrees of Freedom : {} """ print(msg.format(chiSqr, pValue, freeDeg)) print('\n' + '## Expected Values') print('-'*60) print(expectedV) print('-'*60 + '\n') if not check_chisq_rule: w_msg1 = 'Warning: The precondition of Chi-Square was not implemented.' w_msg2 = 'Chi-squared approximation may be incorrect.' w_msg = '\n'.join([w_msg1, w_msg2]) warnings.warn(w_msg) return pd.Series({'cross_t': crossT, 'chisq-stat': chiSqr, 'p_value': pValue, 'df': freeDeg, 'expected': expectedV})
[docs]def fisher_test(data, x=None, y=None, alternative='two-sided', margin=True, print_ok=True): """Fisher's Exact Test. """ dataChi = data[[x, y]].dropna() crossT = pd.crosstab(dataChi[x], dataChi[y], margins=margin) if margin: crossT_raw = crossT.iloc[:-1, :-1] odds_ratio, p_value = st.fisher_exact(crossT_raw, alternative=alternative) else: odds_ratio, p_value = st.fisher_exact(crossT, alternative=alternative) if print_ok: print('#'*20 + ' Fisher\'s Exact Test ' + '#'*20 + '\n') print('x : ' + x) print('y : ' + y) print('\n## Contingency Table') print('-'*50) print(crossT) print('-'*50) msg = """ ## Odds Ratio : {} ## P-value : {} ## Alternative Hypothesis : {} """ print(msg.format(odds_ratio, p_value, alternative)) print('-'*60 + '\n') return pd.Series({'odds_ratio': odds_ratio, 'p_value': p_value})
if __name__ == '__main__': a = [65,75,83,75,89,95,80,69,76,79,81,90,75,72,83,92,61,87,62.80] b = [54,65,91,76,79,78,84,97,88,75,72,68,76,85,72,88] # %% f_test(a, b) f_test(a, b, alternative='greater') f_test(a, b, alternative='less')