Source code for unipy.stats.metrics

"""Metric Functions.

"""


import numpy as np
import pandas as pd
import statsmodels.api as sm
import itertools as it
import scipy.stats as st
from sklearn.preprocessing import PolynomialFeatures as pnf


__all__ = ['deviation',
           'vif',
           'mean_absolute_percentage_error',
           'average_absolute_deviation',
           'median_absolute_deviation',
           'calculate_interaction']


[docs]def deviation(container, method='mean', if_abs=True):

    """Deviation.

    """
    if method == 'mean':
        center = np.nanmean(container)
    elif method == 'median':
        center = np.nanmedian(container)

    resIter = map(lambda x: x - center, container)

    if if_abs:
        resIter = map(np.absolute, resIter)

    res = np.fromiter(resIter, dtype=np.float)

    return res


[docs]def vif(y, X):
    """Variance inflation factor.

    """
    assert isinstance(y, pd.Series)
    assert isinstance(X, pd.DataFrame)

    # Change input to array
    y_arr = y.values
    X_arr = X.values

    # Calculate a linear regression(Ordinary Least Square)
    reg = sm.add_constant(X_arr)
    est = sm.OLS(y_arr, reg).fit()

    # Get a R-square
    rsq = est.rsquared

    # Get a VIF
    vif = 1 / (1 - rsq)

    return vif


[docs]def mean_absolute_percentage_error(measure, predict, thresh=3.0):
    '''Mean Absolute Percentage Error.
    It is a percent of errors.
    It measures the prediction accuracy of a forecasting method in Statistics
    with the real mesured values and the predicted values, for example in trend
    estimation.
    If MAPE is 5, it means this prediction method potentially has 5% error.
    It cannot be used if there are zero values,
    because there would be a division by zero.
    '''
    mape = np.mean(np.absolute((measure - predict) / measure)) * 100

    return mape


[docs]def average_absolute_deviation(measure, predict, thresh=2):
    '''Average Absolute Deviation.
    It is ...
    It measures the prediction accuracy of a forecasting method in Statistics
    with the real mesured values and the predicted values, for example in trend
    estimation.
    If MAD is 5, it means this prediction method potentially has...
    '''
    aad = np.mean(np.absolute(measure - predict))

    return aad


[docs]def median_absolute_deviation(measure, predict, thresh=2):
    '''Median Absolute Deviation.
    It is ...
    It measures the prediction accuracy of a forecasting method in Statistics
    with the real mesured values and the predicted values, for example in trend
    estimation.
    If MAD is 5, it means this prediction method potentially has...
    '''
    mad = np.median(np.absolute(measure - predict))

    return mad


[docs]def calculate_interaction(rankTbl, pvTbl, target, ranknum=10):
    """Feature interaction calculation.

    """
    rankTop = rankTbl[:ranknum]
    interPvt = pvTbl[rankTop['var_name']]
    interAct = pnf(degree=2, interaction_only=True)

    interTbl = pd.DataFrame(interAct.fit_transform(interPvt),
                            index=interPvt.index).iloc[:, 1:]
    rankTop_col = list(rankTop['var_name'])
    interAct_col = list(map(' xx '.join,
                        list(it.combinations(rankTop['var_name'], 2))))
    interTbl.columns = rankTop_col + interAct_col

    # Generate a Result Table
    col = ['slope', 'intercept', 'corr_coef', 'p_value', 'std_err']
    ind = interTbl.columns
    regMatrix = pd.DataFrame(index=ind, columns=col)

    # Regression
    Y = pvTbl[target]
    for _ in range(interTbl.shape[1]):
        x = interTbl.ix[:, _]
        regMatrix.iloc[_, ] = st.linregress(x, Y)

    regMatrix['abs_corr_coef'] = abs(regMatrix['corr_coef'])
    regMatrix.sort_values(by='p_value', ascending=True, inplace=True)

    rank = regMatrix[(regMatrix['p_value'] < .01) &
                     (regMatrix['abs_corr_coef'] >= .3)]

    rank = rank.reset_index()
    rank['inter_name'] = rank['index']
    rank = rank[rank['inter_name'].str.find(' xx ') != -1]
    rank['rank'] = range(1, len(rank) + 1)

    rankCol = ['rank', 'inter_name', 'p_value',
               'corr_coef', 'abs_corr_coef',
               'std_err', 'slope', 'intercept']
    rank = rank[rankCol]

    return rank, regMatrix, interTbl