# -*- coding: utf-8 -*-
"""Collection of DataFrame inspection, styling, and EDA plotting."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy.stats as sts
from IPython.display import display
def big_pandas(user_options=None, verbose=0):
    """Change the default pandas display settings to show all columns and rows.

    Parameters:
        user_options (dict, optional):
            Nested dict of ``{category: {option_name: value}}`` matching pandas
            options (e.g. ``{'display': {'max_rows': 100}}``). If None, a
            default set showing all rows/columns is applied.
        verbose (int):
            If > 0, print each option as it is set.

    Returns:
        dict: the options dictionary that was applied.
    """
    import pandas as pd
    if user_options is None:  # identity check: `== None` is unidiomatic
        options = {
            'display': {
                'max_columns': None,
                'expand_frame_repr': False,
                'max_rows': None,
                'max_info_columns': 500,
                'precision': 4,
            }
        }
    else:
        options = user_options
    for category, option in options.items():
        for op, value in option.items():
            # pandas option keys are dotted, e.g. 'display.max_rows'
            pd.set_option(f'{category}.{op}', value)
            if verbose > 0:
                print(f'{category}.{op}={value}')
    return options
def reset_pandas():
    """Revert every pandas option to its library default.

    Thin wrapper around ``pd.reset_option('all')``; returns its result (None).
    """
    return pd.reset_option('all')
def ignore_warnings():
    """Silence every warning category (deprecation, future, pending, ...).

    Returns the (None) result of ``warnings.filterwarnings('ignore')``.
    """
    import warnings
    return warnings.filterwarnings('ignore')
def reset_warnings():
    """Restore the default warning-filter behaviour for all categories."""
    import warnings
    return warnings.simplefilter(action='default')
# def list2df(list, index_col=None):#, sort_values='index'):
# """ Quickly turn an appended list with a header (row[0]) into a pretty dataframe.
# Ex: list_results = [["Test","N","p-val"]] #... (some sort of analysis performed to produce results)
# list_results.append([test_Name,length(data),p])
# list2df(list_results)
# """
# # with pd.option_context("display.max_rows", None, "display.max_columns", None ,
# # 'display.precision',3,'display.notebook_repr_htm',True):
# df_list = pd.DataFrame(list[1:],columns=list[0])
# if index_col==None:
# return df_list
# else:
# df_list.reset_index(inplace=True)
# df_list.set_index(index_col, inplace=True)
# return df_list
def check_df_for_columns(df, columns=None):
    """Check a DataFrame for the presence of the given column names.

    Prints a confirmation for each name found and a warning for each name
    that is missing.

    Args:
        df: pd.DataFrame to find columns in.
        columns: str or list of str. Column names to look for.

    Returns:
        None. Results are printed only.
    """
    if not columns:
        print('check_df_for_columns expected to be passed a list of column names.')
        return
    for column in columns:
        if column in df.columns:
            print(f'{column} is a valid column name')
        else:
            # Bug fix: missing columns used to be skipped silently
            # (the original `continue` branch gave no feedback at all).
            print(f'{column} is NOT a column in the dataframe')
def check_unique(df, columns=None):
    """Print unique-value summaries for a Series or DataFrame columns.

    For a Series: prints its name, dtype and number of unique values, and
    returns its value counts as a DataFrame.
    For a DataFrame: displays the value counts of every column (or only of
    the ones listed in ``columns``) and returns None.

    Params:
        df: pandas DataFrame or pd.Series.
        columns: list of column-name strings (DataFrame input only).
    Returns:
        pd.DataFrame of value counts for Series input, otherwise None.
    """
    if isinstance(df, pd.Series):
        n_uniq = df.nunique()
        print('\n---------------------------\n')
        print(f"{df.name} Type: {df.dtype}\nNumber unique values: {n_uniq}")
        return pd.DataFrame(df.value_counts())

    # DataFrame path: fall back to every column when none are given.
    cols = df.columns if columns is None else columns
    for name in cols:
        n_uniq = df[name].nunique()
        counts = pd.DataFrame(df[name].value_counts())
        print('\n---------------------------')
        print(f"\n{name} Type: {df[name].dtype}\nNumber unique values: {n_uniq}.")
        display(counts)
def check_numeric(df, columns=None, unique_check=False, return_list=False, show_df=False):
    """Scan object-dtype columns for values that look numeric.

    Iterates through columns and checks for possible numeric features
    stored as strings ('object' dtype).

    Parameters:
        df (pd.DataFrame): frame to inspect.
        columns (list, optional): subset of columns to check; default all.
        unique_check (bool): if True, interactively offer (via input()) to
            display the unique values of the flagged columns.
        return_list (bool): if True, also return the list of flagged
            column names.
        show_df (bool): if True, display() the styled summary table.

    Returns:
        Styled summary DataFrame (always); plus the list of flagged column
        names when return_list=True.
    """
    from .bs_ds import list2df
    display_list = [['Column', 'Numeric values', 'Total Values', 'Percent']]
    outlist = []
    # Use the caller's column list when given, otherwise scan everything.
    if columns is None:  # fixed: was `columns == None`
        columns_to_check = df.columns
    else:
        columns_to_check = columns
    for col in columns_to_check:
        # Only string/object columns can hide numeric text.
        if df[col].dtype == 'object':
            if df[col].str.isnumeric().any():
                vals = df[col].str.isnumeric().sum()
                percent = round((df[col].str.isnumeric().sum() / len(df[col])) * 100, 2)
                display_list.append([col, vals, len(df[col]), percent])
                outlist.append(col)
    list2show = list2df(display_list)
    list2show.set_index('Column', inplace=True)
    styled_list2show = list2show.style.set_caption('# of Detected Numeric Values in "Object" columns:')
    if show_df == True:
        display(styled_list2show)
    # Optional interactive inspection loop over the flagged columns.
    if unique_check:
        unique = input("display unique values? (Enter 'y' for all columns, a column name, or 'n' to quit):")
        while unique != 'n':
            if unique == 'y':
                check_unique(df, outlist)
                break
            elif unique in outlist:
                check_unique(df, [unique])
            unique = input('Enter column name or n to quit:')
    if return_list == True:
        return styled_list2show, outlist
    return styled_list2show
def check_null(df, columns=None, show_df=False):
    """Count null values per column and report # and % of each column.

    Parameters:
        df (pd.DataFrame): frame to inspect.
        columns (list, optional): columns to check; default all.
        show_df (bool): if True, display() the styled summary table.

    Returns:
        Styled summary DataFrame.
    """
    from .bs_ds import list2df
    display_list = [['Column', 'Null values', 'Total Values', 'Percent']]
    # Use the caller's column list when given, otherwise check everything.
    if columns is None:  # fixed: was `columns == None`
        columns_to_check = df.columns
    else:
        columns_to_check = columns
    for col in columns_to_check:
        vals = df[col].isna().sum()
        percent = round((vals / len(df[col])) * 100, 3)
        display_list.append([col, vals, len(df[col]), percent])
    list2show = list2df(display_list)
    list2show.set_index('Column', inplace=True)
    styled_list2show = list2show.style.set_caption('# of Identified Null Values:')
    if show_df == True:
        display(styled_list2show)
    return styled_list2show
def compare_duplicates(df1, df2, to_drop=True, verbose=True, return_names_list=False):
    """Compare two dfs for duplicate columns; optionally drop interactively.

    Useful before concatenating when dtypes differ between matching column
    names and df.drop_duplicates is not an option.

    Params:
    --------------------
    df1, df2 : pandas dataframes suspected of having matching columns.
    to_drop : bool, (default=True)
        If True, interactively offers dropping each shared column from one
        frame or the other (modifies df1/df2 in place).
    verbose : bool (default=True)
        If True, prints shared column names and dtypes; set False with
        return_names_list=True for a non-interactive name list.
    return_names_list : bool (default=False)
        If True, returns a list of all duplicate column names.
    --------------------
    Returns: list of shared column names if return_names_list=True, else None.
    """
    catch = []
    dropped1 = []
    dropped2 = []
    if verbose:
        print("Column | df1 | df2 ")
        print("*----------------------*")
    # Inspect every column shared by both frames.
    for col in df1.columns:
        if col in df2.columns:
            catch.append(col)
            if verbose:
                print(f"{col} {df1[col].dtype} {df2[col].dtype}")
            # Accept user input and drop columns one by one.
            if to_drop:
                choice = input("\nDrop this column? Enter 1. df1, 2. df2 or n for neither")
                if choice == "1":
                    df1.drop(columns=col, axis=1, inplace=True)
                    dropped1.append(col)
                elif choice == "2":
                    df2.drop(columns=col, axis=1, inplace=True)
                    dropped2.append(col)
                else:
                    continue
    # Report which columns were dropped from each originating frame.
    if to_drop:
        if len(dropped1) >= 1:
            print(f"\nDropped from df1:\n{dropped1}")
        if len(dropped2) >= 1:
            # Bug fix: this message previously said "df1" for df2's drops.
            print(f"\nDropped from df2:\n{dropped2}")
    if return_names_list:
        return catch
## Dataframes styling
def check_column(panda_obj, columns=None, nlargest='all'):
    """Print column name, dtype, #/% of null values and value counts.

    For a Series, summarizes the series itself; for a DataFrame, every
    column is summarized.

    Params:
        panda_obj: pandas DataFrame or Series.
        columns: accepted for interface compatibility; not used by the body.
        nlargest: 'all' to print every value count, or an int for the
            top-n most frequent values.
    Returns:
        None. Results are printed only.
    """
    # isinstance is the idiomatic (subclass-friendly) check; the original
    # compared type() against pd.core.* internals.
    if isinstance(panda_obj, pd.Series):
        series = panda_obj
        print(f'\n----------------------------\n')
        print(f"Column: df['{series.name}']':")
        print(f"dtype: {series.dtype}")
        print(f"isna: {series.isna().sum()} out of {len(series)} - {round(series.isna().sum()/len(series)*100,3)}%")
        print(f'\nUnique non-na values:')
        if nlargest == 'all':
            print(series.value_counts())
        else:
            print(series.value_counts().nlargest(nlargest))
    elif isinstance(panda_obj, pd.DataFrame):
        df = panda_obj
        for col_name in df.columns:
            col = df[col_name]
            print("\n-----------------------------------------------")
            print(f"Column: df['{col_name}']':")
            print(f"dtype: {col.dtypes}")
            print(f"isna: {col.isna().sum()} out of {len(col)} - {round(col.isna().sum()/len(col)*100,3)}%")
            print(f'\nUnique non-na values:\nnlargest={nlargest}\n-----------------')
            if nlargest == 'all':
                print(col.value_counts())
            else:
                print(col.value_counts().nlargest(nlargest))
## DataFrame Creation, Inspection, and Exporting
def inspect_df(df, n_rows=3, verbose=True):
    """EDA: show all pandas inspection tables for a DataFrame.

    Displays df.info(), df.describe() and df.head(). By default
    (verbose=True) also runs check_null and check_numeric and shows the
    three summaries side by side.

    Parameters:
        df (DataFrame): dataframe to inspect.
        n_rows (int): number of header rows to show (default=3).
        verbose (bool): if True (default), run check_null and check_numeric.

    Ex: inspect_df(df, n_rows=4)
    """
    from bs_ds.bamboo import check_column, check_null, check_numeric, check_unique
    from bs_ds.prettypandas import display_side_by_side
    import pandas as pd
    with pd.option_context("display.max_columns", None, 'display.precision', 4):
        # df.info() prints its report directly and returns None; the
        # original wrapped it in display(), which rendered a stray "None".
        df.info()
        if verbose == True:
            df_num = check_numeric(df, unique_check=False, show_df=False)
            df_null = check_null(df, show_df=False)
            display_side_by_side(df_null, df_num, df.describe())
        else:
            display(df.describe())
        display(df.head(n_rows))
def drop_cols(df, list_of_strings_or_regexp, verbose=0):
    """EDA: return a copy of ``df`` without the columns whose names match
    any of the given string/regex patterns.

    Example:
        # df_in columns: ['price','sqft','sqft_living','sqft15','floors','bedrooms']
        df_out = drop_cols(df_in, ['sqft', 'bedroom'])
        # df_out.columns -> ['price', 'floors']

    Parameters:
        df: input dataframe (left unmodified).
        list_of_strings_or_regexp: list of patterns passed to df.filter(regex=...).
        verbose: if > 0, print each pattern as it is applied.

    Returns:
        df_dropped: a new dataframe without the matching columns.
    """
    result = df.copy()
    for pattern in list_of_strings_or_regexp:
        matched_names = list(result.filter(regex=pattern))
        result = result[result.columns.drop(matched_names)]
        if verbose > 0:
            print(f'Removed {pattern}.')
    return result
def add_filtered_col_to_df(df_source, df_to_add_to, list_of_exps, return_filtered_col_names=False):
    """Copy columns matching regex patterns from one dataframe onto another.

    Selects columns of ``df_source`` via df.filter(regex=exp, axis=1) for
    each expression in ``list_of_exps`` and concatenates them (column-wise)
    onto ``df_to_add_to``.

    Parameters:
        df_source (DataFrame): frame to copy columns from.
        df_to_add_to (DataFrame): frame the columns are added to (a new
            frame is returned; the argument itself is not mutated).
        list_of_exps (list): text/regex expressions matched against column names.
        return_filtered_col_names (bool): if True, also return a dict
            mapping each expression to the column names it matched.

    Returns:
        The combined dataframe; plus the match dict when requested.
    """
    import pandas as pd
    filtered_col_list = {}
    for exp in list_of_exps:
        df_temp_filtered = df_source.filter(regex=(exp), axis=1).copy()
        filtered_col_list[exp] = list(df_temp_filtered.columns)
        # Bug fix: concatenate column-wise (axis=1). The original used the
        # default axis=0, stacking the frames row-wise and producing
        # NaN-padded rows instead of adding columns.
        df_to_add_to = pd.concat([df_to_add_to, df_temp_filtered], axis=1)
    if return_filtered_col_names == False:
        return df_to_add_to
    else:
        print(filtered_col_list)
        return df_to_add_to, filtered_col_list
##
# EDA / Plotting Functions
def multiplot(df, annot=True, fig_size=None):
    """EDA: plot df.corr() as a masked heatmap to inspect multicollinearity.

    Parameters:
        df (DataFrame): data whose numeric columns are correlated.
        annot (bool): annotate each cell with its correlation value.
        fig_size (tuple): figure size; defaults to (16, 16).

    Returns:
        fig, ax matplotlib objects.
    """
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set(style="white")
    # Compute the correlation matrix.
    corr = df.corr()
    # Mask the upper triangle (it mirrors the lower one).
    # Bug fix: np.bool was deprecated and removed in NumPy 1.24 —
    # use the builtin bool as dtype instead.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure.
    figsize = (16, 16) if fig_size is None else fig_size
    f, ax = plt.subplots(figsize=figsize)
    # Draw the heatmap with a diverging colormap and correct aspect ratio.
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, annot=annot, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    return f, ax
# Plots histogram and scatter (vs price) side by side
def plot_hist_scat(df, target=None, figsize=(12,9), fig_style='dark_background', font_dict=None, plot_kwds=None):
    """EDA: summary plots of all numeric columns of a df vs a target column.

    For every column in df.describe(), shows a distplot (subplot 1) and a
    regplot against the target column (subplot 2).

    Parameters:
        df (DataFrame):
            DataFrame.describe() columns will be plotted.
        target (string):
            Name of the column containing the target variable; defaults to
            the first column of df.
        figsize (tuple):
            Tuple for figsize. Default=(12,9).
        fig_style:
            Figure style used only inside this function (will not change
            others in the notebook). Default is 'dark_background'.
        font_dict:
            Optional dict of font-property dicts under the keys
            'fontTitle', 'fontAxis', 'fontTicks'.
        plot_kwds:
            Optional dict containing any of 'hist_kws', 'kde_kws',
            'line_kws', 'scatter_kws' dictionaries of valid matplotlib
            key:value pairs (subplot 1 uses hist/kde, subplot 2 uses
            line/scatter).

    Returns:
        fig, ax: figure and subplot-axes array for the last column plotted
        (ax[0,0] = histogram, ax[0,1] = regression plot).
    """
    import matplotlib.ticker as mtick
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Bug fix: default the target to the first column NAME. The original
    # assigned df.iloc[:, 0] (a Series), which breaks df[target] and
    # target.title() below.
    if target is None:
        target = df.columns[0]

    ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------##
    # Checking for user font_dict, otherwise set serif/bold defaults:
    if font_dict is None:
        fontTitle = {'fontsize': 16, 'fontweight': 'bold', 'fontfamily': 'serif'}
        fontAxis = {'fontsize': 14, 'fontweight': 'bold', 'fontfamily': 'serif'}
        fontTicks = {'fontsize': 12, 'fontweight': 'bold', 'fontfamily': 'serif'}
    else:
        fontTitle = font_dict.get('fontTitle', {'fontsize': 16, 'fontweight': 'bold', 'fontfamily': 'serif'})
        fontAxis = font_dict.get('fontAxis', {'fontsize': 14, 'fontweight': 'bold', 'fontfamily': 'serif'})
        fontTicks = font_dict.get('fontTicks', {'fontsize': 12, 'fontweight': 'bold', 'fontfamily': 'serif'})

    # Checking for user plot_kwds, otherwise use the defaults below.
    if plot_kwds is None:
        hist_kws = {"linewidth": 1, "alpha": 1, "color": 'steelblue', 'edgecolor': 'w', 'hatch': '\\'}
        kde_kws = {"color": "white", "linewidth": 3, "label": "KDE", 'alpha': 0.7}
        line_kws = {"color": "white", "alpha": 0.5, "lw": 3, "ls": ":"}
        scatter_kws = {'s': 2, 'alpha': 0.8, 'marker': '.', 'color': 'steelblue'}
    else:
        kwds = plot_kwds
        # Subplot 1 dictionaries (distplot):
        hist_kws = kwds.get('hist_kws', {"linewidth": 1, "alpha": 1, "color": 'steelblue', 'edgecolor': 'w', 'hatch': '\\'})
        kde_kws = kwds.get('kde_kws', {"color": "white", "linewidth": 3, "label": "KDE", 'alpha': 0.7})
        # Subplot 2 dictionaries (regplot):
        line_kws = kwds.get('line_kws', {"color": "white", "alpha": 0.5, "lw": 3, "ls": ":"})
        scatter_kws = kwds.get('scatter_kws', {'s': 2, 'alpha': 0.8, 'marker': '.', 'color': 'steelblue'})

    with plt.style.context(fig_style):
        ### PLOTTING ------------------------------------------------- ##
        # Loop through the numeric columns of the dataframe to plot.
        for column in df.describe():
            # Create a figure with subplots for the current column.
            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)

            ## SUBPLOT 1: distribution ------------------------------##
            i, j = 0, 0
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)
            # Plot distplot on ax[i,j] using hist_kws and kde_kws.
            sns.distplot(df[column], norm_hist=True, kde=True,
                         hist_kws=hist_kws, kde_kws=kde_kws,
                         label=column + ' histogram', ax=ax[i, j])
            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)
            # Get x-ticks, rotate labels, and reapply.
            xticklab1 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())
            ax[i, j].set_ylabel('Density', fontdict=fontAxis)
            yticklab1 = ax[i, j].get_yticklabels(which='both')
            ax[i, j].set_yticklabels(labels=yticklab1, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())
            # y-grid behind the plot elements.
            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            ## SUBPLOT 2: scatter + regression vs target ------------##
            i, j = 0, 1
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)
            # Plot regplot on ax[i,j] using line_kws and scatter_kws.
            sns.regplot(df[column], df[target],
                        line_kws=line_kws,
                        scatter_kws=scatter_kws,
                        ax=ax[i, j])
            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)
            xticklab2 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab2, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())
            ax[i, j].set_ylabel(target.title(), fontdict=fontAxis)
            yticklab = ax[i, j].get_yticklabels()
            ax[i, j].set_yticklabels(yticklab, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())
            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            ## ---------- Final layout adjustments ----------- ##
            # Delete the unused bottom-row subplots and tighten layout.
            fig.delaxes(ax[1, 1])
            fig.delaxes(ax[1, 0])
            fig.tight_layout()

    return fig, ax
## Finding outliers and statistics
# Tukey's method using IQR to eliminate
def detect_outliers(df, n, features):
    """Tukey IQR method: return index labels of rows that are outliers in
    MORE than ``n`` of the given feature columns.

    Parameters:
        df (DataFrame): DataFrame containing the feature columns.
        n (int): multiple-outlier cutoff; a row must be flagged in more
            than n columns to be returned.
        features (list): column names to test.

    Returns:
        List of index labels usable with df.loc / df.drop.

    Examples:
        Outliers_to_drop = detect_outliers(data, 2, ["col1", "col2"])
        df.loc[Outliers_to_drop]  # show the outlier rows
        data = data.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
    """
    from collections import Counter
    flagged = Counter()
    for feature in features:
        # Quartiles and the 1.5*IQR Tukey fence for this column.
        q1, q3 = np.percentile(df[feature], [25, 75])
        step = 1.5 * (q3 - q1)
        # Count every row falling outside the fence.
        outside = (df[feature] < q1 - step) | (df[feature] > q3 + step)
        flagged.update(df[outside].index)
    # Keep only rows flagged in more than n columns.
    return [idx for idx, hits in flagged.items() if hits > n]