Source code for unipy.stats.feature_selection

"""Feature selection.

"""


# import numba as nb
import numpy as np
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from unipy.stats.formula import from_formula
from unipy.stats.metrics import vif


# Public names exported by ``from unipy.stats.feature_selection import *``.
# ``_lasso_for_loop`` is a private helper and is intentionally not listed.
__all__ = ['lasso_rank',
           'feature_selection_vif']


# Defining a Lasso generic function
def _lasso_for_loop(data, X=None, y=None, alpha=.0001, *args, **kwargs):
    """Fit a single LASSO model and summarise it as one result row.

    The regressors are centred and divided by their l2-norm before fitting,
    and the fitted coefficients/intercept are mapped back to the original
    feature scale afterwards.  This is what ``Lasso(normalize=True)`` used to
    do; that option was removed in scikit-learn 1.2, so the scaling is done
    explicitly here to keep the helper working on current releases.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the explanatory and the response columns.
    X : list
        Names of the explanatory columns in ``data``.
    y : list or str
        Name(s) of the response column; ``data[y]`` is squeezed before use.
    alpha : float
        L1 penalty strength handed to ``Lasso``.
    *args, **kwargs
        Forwarded unchanged to the ``Lasso`` constructor.

    Returns
    -------
    ret : list
        ``[rss, intercept, coef_1, ..., coef_p]`` with coefficients in the
        same order as ``X``.
    yPredict : numpy.ndarray
        In-sample predictions for ``data[X]``.
    """
    target = data[y].squeeze()

    # Manual equivalent of the removed ``normalize=True`` preprocessing.
    features = np.asarray(data[X], dtype=float)
    offset = features.mean(axis=0)
    centered = features - offset
    scale = np.sqrt((centered ** 2).sum(axis=0))
    scale[scale == 0] = 1.0  # constant columns: avoid division by zero
    scaled = centered / scale

    # ``max_iter`` must be an integer; recent scikit-learn rejects ``1e5``.
    lassoReg = Lasso(alpha=alpha, fit_intercept=True,
                     precompute=False,
                     max_iter=100000, tol=1e-7,
                     warm_start=False, positive=False,
                     selection='cyclic', *args, **kwargs)

    lassoReg.fit(scaled, target)
    yPredict = lassoReg.predict(scaled)

    # Express the model on the original (unscaled, uncentred) features.
    coef = lassoReg.coef_ / scale
    intercept = lassoReg.intercept_ - np.dot(offset, coef)

    # Return the result in the pre-defined row format.
    rss = np.sum((yPredict - target) ** 2)
    ret = [rss, intercept]
    ret.extend(coef)

    return ret, yPredict


def lasso_rank(formula=None, X=None, y=None, data=None,
               alpha=np.arange(1e-5, 1e-2, 1e-4), k=2, plot=False,
               *args, **kwargs):
    r"""Feature selection by LASSO regression.

    One LASSO model is fitted per value in ``alpha``.  Among the models that
    keep at most ``k`` non-zero feature coefficients, one is selected and its
    non-zero coefficients are ranked by absolute value.

    Parameters
    ----------
    formula:
        R-style formula string.  When given, ``X`` and ``y`` are taken from
        ``from_formula(formula)`` and the ``X``/``y`` arguments are ignored.

    X: list-like
        Column values for X.

    y: list-like
        A column value for y.

    data: pandas.DataFrame
        A DataFrame.

    alpha: Iterable
        An Iterable contains alpha values.  It is consumed only once, so a
        generator is acceptable.

    k: int
        Maximum number of non-zero feature coefficients a model may have to
        be considered for ``kBest``.

    plot: Boolean (default: False)
        Accepted for backward compatibility; plotting is currently disabled
        and this flag is not read.

    *args, **kwargs
        Forwarded to ``_lasso_for_loop`` for every alpha.

    Returns
    -------
    rankTbl: pandas.DataFrame
        Non-zero features of ``kBest`` with columns ``rank``, ``lasso_coef``
        and ``abs_coef``, sorted by descending ``abs_coef``.

    minIntercept: pandas.DataFrame
        The minimum intercept row in coefficient matrix.

    coefMatrix: pandas.DataFrame
        A coefficient matrix: one row per alpha with columns ``RSS``,
        ``Intercept``, one column per feature, and ``var_count``.

    kBest: pandas.DataFrame
        The selected row of the coefficient matrix for the given ``k``.

    kBestPredY: dict
        A predicted ``Y`` with ``kBest`` alpha, keyed by the alpha label.

    Raises
    ------
    ValueError
        If ``alpha`` is empty, or if no fitted model has ``k`` or fewer
        non-zero feature coefficients.

    Example
    -------
    >>> import unipy.dataset.api as dm
    >>> dm.init()
    ['cars', 'anscombe', 'iris', 'nutrients', 'german_credit_scoring_fars2008', 'winequality_red', 'winequality_white', 'titanic', 'car90', 'diabetes', 'adult', 'tips', 'births_big', 'breast_cancer', 'air_quality', 'births_small']
    >>> wine_red = dm.load('winequality_red')
    Dataset : winequality_red
    >>>
    >>> ranked, best_by_intercept, coefTbl, kBest, kBestPred = lasso_rank(X=wine_red.columns.drop('quality'), y=['quality'], data=wine_red)
    >>> ranked
                      rank  lasso_coef  abs_coef
    volatile_acidity     1   -0.675725  0.675725
    alcohol              2    0.194865  0.194865
    >>> best_by_intercept
                          RSS  Intercept  fixed_acidity  volatile_acidity  \
    alpha_0.00121  691.956364   3.134874       0.002374         -1.023793

                   citric_acid  residual_sugar  chlorides  free_sulfur_dioxide  \
    alpha_0.00121          0.0             0.0  -0.272912                 -0.0

                   total_sulfur_dioxide  density   pH  sulphates   alcohol  \
    alpha_0.00121             -0.000963     -0.0 -0.0   0.505956  0.264552

                   var_count
    alpha_0.00121          6
    >>>
    """
    if formula is not None:
        X, y = from_formula(formula)
    else:
        X = list(X)

    # Fit each alpha exactly once and keep both outputs of the fit.
    coefRows = {}
    predict = {}
    for a in alpha:
        label = 'alpha_%.5f' % a
        coefRows[label], predict[label] = _lasso_for_loop(
            data, X=X, y=y, alpha=a, *args, **kwargs)

    if not coefRows:
        raise ValueError("'alpha' must contain at least one value.")

    coefMatrix = pd.DataFrame(coefRows).T
    coefMatrix.columns = ['RSS', 'Intercept'] + X

    # Count non-zero coefficients over the feature columns only, so an RSS or
    # intercept that happens to be exactly zero cannot distort the count.
    coefMatrix['var_count'] = (coefMatrix.iloc[:, 2:] != 0).sum(axis=1)

    # Keep only the models with var_count <= k
    candidates = coefMatrix[coefMatrix['var_count'] <= k]
    if candidates.empty:
        raise ValueError(
            'No alpha produced a model with %s or fewer non-zero '
            'coefficients.' % k)

    # NOTE(review): idxmax() yields a single label, so the idxmin() step below
    # operates on a one-row frame and keeps that same row.  Left unchanged to
    # preserve the existing selection; confirm whether a tie-break on the
    # intercept among all rows with the maximum var_count was intended.
    kBest = candidates.loc[candidates[['var_count']].idxmax()]
    kBest = kBest.loc[kBest[['Intercept']].idxmin()]

    # Minimum Intercept
    minIntercept = coefMatrix.loc[coefMatrix[['Intercept']].idxmin()]

    # Get Predicted Y value
    alphaVal = kBest.index[0]
    kBestPredY = {alphaVal: predict[alphaVal]}

    # Get a Rank Table: positions of the non-zero feature columns, skipping
    # the leading 'RSS'/'Intercept' and the trailing 'var_count' columns.
    featureVals = np.asarray(kBest.iloc[0, 2:-1], dtype=float)
    nonzeroPos = (np.flatnonzero(featureVals) + 2).tolist()
    lassoVal = kBest.iloc[:, nonzeroPos]

    filteredTbl = pd.concat([lassoVal.T, abs(lassoVal).T], axis=1)
    filteredTbl.columns = ['lasso_coef', 'abs_coef']
    filteredTbl = filteredTbl.sort_values(by='abs_coef', ascending=False)
    filteredTbl['rank'] = range(1, len(filteredTbl) + 1)
    rankTbl = filteredTbl[['rank', 'lasso_coef', 'abs_coef']]

    return rankTbl, minIntercept, coefMatrix, kBest, kBestPredY


def feature_selection_vif(data, thresh=5.0):
    """Stepwise Feature Selection for multivariate analysis.

    It calculates the variance inflation factor (VIF) of every explanatory
    variable against all the others. If the maximum VIF is greater than or
    equal to the given threshold, that variable is dropped. This process is
    repeated until the maximum VIF is lower than the threshold or fewer than
    two variables remain.

    Recommended threshold is lower than 5, because if VIF is greater than 5,
    then the explanatory variable selected is highly collinear with the other
    explanatory variables, and the parameter estimates will have large standard
    errors because of this.

    Parameters
    ----------
    data : DataFrame, (rows: observed values, columns: multivariate variables)
        design dataframe with all explanatory variables, as for example used in
        regression

    thresh : int, float
        A threshold of VIF

    Returns
    -------
    Filtered_data : DataFrame
        A subset of the input DataFrame

    dropped_List : DataFrame
        'var' column : dropped variable names from input data columns
        'vif' column : variance inflation factor of dropped variables

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame.

    Notes
    -----
    This function does not save the auxiliary regression.
    Progress and a final summary are written to standard output.

    See Also
    --------
    statsmodels.stats.outliers_influence.variance_inflation_factor

    References
    ----------
    http://en.wikipedia.org/wiki/Variance_inflation_factor

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("'data' must be a pandas DataFrame, got "
                        + type(data).__name__)

    # Dropped variables are collected here and turned into a DataFrame once,
    # because ``DataFrame.append`` no longer exists in pandas 2.x.
    droppedRecords = []

    # A VIF needs at least one *other* explanatory variable, so stop as soon
    # as fewer than two columns are left.
    while data.shape[1] > 1:

        # 1. Calculate a VIF for every remaining column
        vifDict = {col: vif(data.loc[:, col], data.loc[:, data.columns != col])
                   for col in data.columns}

        # Get the MAXIMUM VIF
        maxVar = max(vifDict, key=vifDict.get)
        maxVal = vifDict[maxVar]

        # 2. Stop once the largest VIF is below the threshold.  Written as
        #    ``not (>=)`` so that a NaN VIF also ends the loop.
        if not (maxVal >= thresh):
            break

        # Keep a record of it
        droppedRecords.append({'var': maxVar, 'vif': maxVal})

        # Drop it
        data = data.drop(maxVar, axis=1)

        # Print it
        print("Dropping '" + str(maxVar) + "' " + " VIF: " + str(maxVal))

    dropped = pd.DataFrame(droppedRecords, columns=['var', 'vif'])

    # Print Messages
    remainsMsg = '# Remaining Variables '
    msgWrapper = '-' * (len(remainsMsg)+1)

    print('\n' + msgWrapper + '\n' + remainsMsg + '\n' + msgWrapper)
    print(list(data.columns))
    print('\n')

    # The second banner deliberately reuses the wrapper sized for
    # ``remainsMsg`` so the printed output stays exactly as before.
    droppedMsg = '# Dropped Variables '
    print('\n' + msgWrapper + '\n' + droppedMsg + '\n' + msgWrapper)
    print(list(dropped.loc[:, 'var']))
    print('\n')

    return data, dropped