Source code for bs_ds.bamboo

# -*- coding: utf-8 -*-
"""Collection of DataFrame inspection, styling, and EDA plotting."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy.stats as sts
from IPython.display import display

def big_pandas(user_options=None, verbose=0):
    """Changes the default pandas display settings to show all columns and all rows.
    User may replace settings with a kwd dictionary matching available options.
    """
    import pandas as pd
    if user_options is None:
        options = {
            'display': {
                'max_columns': None,
                'expand_frame_repr': False,
                'max_rows': None,
                'max_info_columns': 500,
                'precision': 4,
            }
        }
    else:
        options = user_options

    for category, option in options.items():
        for op, value in option.items():
            pd.set_option(f'{category}.{op}', value)  # Python 3.6+
            if verbose > 0:
                print(f'{category}.{op}={value}')
    return options
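# Example usage (illustrative sketch; any subset of valid pandas display
# options may be passed):
# >>> from bs_ds.bamboo import big_pandas, reset_pandas
# >>> big_pandas(verbose=1)                      # show all rows and columns
# >>> big_pandas({'display': {'precision': 2}})  # or pass a custom options dict
# >>> reset_pandas()                             # restore pandas defaults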
def reset_pandas():
    """Resets all pandas options back to default state."""
    return pd.reset_option('all')
def ignore_warnings():
    """Ignores all warnings (including future, deprecation, and pending-deprecation categories)."""
    import warnings
    return warnings.filterwarnings('ignore')
def reset_warnings():
    """Restores the default warnings settings."""
    import warnings
    return warnings.simplefilter(action='default')
# def list2df(list, index_col=None):
#     """Quickly turn an appended list with a header (row[0]) into a pretty dataframe.
#
#     Ex: list_results = [["Test","N","p-val"]]
#         # ... (some sort of analysis performed to produce results)
#         list_results.append([test_Name, len(data), p])
#         list2df(list_results)
#     """
#     df_list = pd.DataFrame(list[1:], columns=list[0])
#     if index_col is None:
#         return df_list
#     else:
#         df_list.reset_index(inplace=True)
#         df_list.set_index(index_col, inplace=True)
#         return df_list
def check_df_for_columns(df, columns=None):
    """Checks df for presence of columns.

    Args:
        df: pd.DataFrame to find columns in.
        columns: str or list of str column names.
    """
    if not columns:
        print('check_df_for_columns expected to be passed a list of column names.')
    else:
        for column in columns:
            if column in df.columns:
                print(f'{column} is a valid column name')
def check_unique(df, columns=None):
    """Prints unique values for all columns in dataframe. If passed a list of
    columns, it will only print results for those columns.

    Params:
        df: pandas DataFrame or pd.Series.
        columns: list containing names of columns (strings).

    Returns:
        None; prints/displays value counts (returns a value_counts DataFrame
        if passed a Series).
    """
    # If passed a Series, display its value counts directly
    if isinstance(df, pd.Series):
        nunique = df.nunique()
        print('\n---------------------------\n')
        print(f"{df.name} Type: {df.dtype}\nNumber unique values: {nunique}")
        return pd.DataFrame(df.value_counts())

    if columns is None:
        columns = df.columns

    for col in columns:
        nunique = df[col].nunique()
        unique_df = pd.DataFrame(df[col].value_counts())
        print('\n---------------------------')
        print(f"\n{col} Type: {df[col].dtype}\nNumber unique values: {nunique}.")
        display(unique_df)
def check_numeric(df, columns=None, unique_check=False, return_list=False, show_df=False):
    """Iterates through columns and checks for possible numeric features labeled as objects.

    Params:
        df: pandas DataFrame.
        columns: list of columns to check (default: all columns).
        unique_check: bool (default=False). If True, displays an interactive
            interface for checking unique values in columns.
        return_list: bool (default=False). If True, also returns a list of
            column names with possible numeric types.
        show_df: bool (default=False). If True, displays the styled results.

    Returns:
        styled dataframe (always); list of column names if return_list=True.
    """
    from .bs_ds import list2df
    display_list = [['Column', 'Numeric values', 'Total Values', 'Percent']]
    outlist = []

    # Check for user column list
    if columns is None:
        columns_to_check = df.columns
    else:
        columns_to_check = columns

    # Iterate through columns
    for col in columns_to_check:
        # Check for object dtype
        if df[col].dtype == 'object':
            # If object, check for numeric strings
            if df[col].str.isnumeric().any():
                # If numeric, get counts
                vals = df[col].str.isnumeric().sum()
                percent = round((vals / len(df[col])) * 100, 2)
                display_list.append([col, vals, len(df[col]), percent])
                outlist.append(col)

    list2show = list2df(display_list)
    list2show.set_index('Column', inplace=True)
    styled_list2show = list2show.style.set_caption('# of Detected Numeric Values in "Object" columns:')
    if show_df == True:
        display(styled_list2show)

    if unique_check:
        unique = input("display unique values? (Enter 'y' for all columns, a column name, or 'n' to quit):")
        while unique != 'n':
            if unique == 'y':
                check_unique(df, outlist)
                break
            elif unique in outlist:
                check_unique(df, [unique])
            unique = input('Enter column name or n to quit:')

    if return_list == True:
        return styled_list2show, outlist
    else:
        return styled_list2show
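# Example usage (illustrative sketch; `df` is any DataFrame with object-dtype
# columns that may actually hold numbers, e.g. '1200' stored as a string):
# >>> styled, numeric_cols = check_numeric(df, return_list=True, show_df=True)
# >>> numeric_cols            # e.g. ['sqft_basement'] -- candidates for pd.to_numeric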
def check_null(df, columns=None, show_df=False):
    """Iterates through columns, checks for null values, and displays the count
    and % of each column.

    Params:
        df: pandas DataFrame.
        columns: list of columns to check (default: all columns).
        show_df: bool (default=False). If True, displays the styled results.

    Returns:
        styled dataframe of null counts.
    """
    from .bs_ds import list2df
    display_list = [['Column', 'Null values', 'Total Values', 'Percent']]
    outlist = []

    # Check for user column list
    if columns is None:
        columns_to_check = df.columns
    else:
        columns_to_check = columns

    # Iterate through columns and count null values
    for col in columns_to_check:
        vals = df[col].isna().sum()
        percent = round((vals / len(df[col])) * 100, 3)
        display_list.append([col, vals, len(df[col]), percent])
        outlist.append(col)

    list2show = list2df(display_list)
    list2show.set_index('Column', inplace=True)
    styled_list2show = list2show.style.set_caption('# of Identified Null Values:')
    if show_df == True:
        display(styled_list2show)
    return styled_list2show
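# Example usage (illustrative sketch):
# >>> check_null(df, show_df=True)   # styled table of null counts and % per column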
def compare_duplicates(df1, df2, to_drop=True, verbose=True, return_names_list=False):
    """Compares two dfs for duplicate columns and optionally drops them. Useful
    before concatenating when dtypes differ between matching column names and
    df.drop_duplicates is not an option.

    Params:
    --------------------
    df1, df2 : pandas dataframes suspected of having matching columns.
    to_drop : bool (default=True). If True, gives the option of dropping
        columns one at a time from either dataframe.
    verbose : bool (default=True). If True, prints column names and types. Set
        to False (with return_names_list=True) if you only want a list of
        column names and no interactive interface.
    return_names_list : bool (default=False). If True, returns a list of all
        duplicate column names.
    --------------------
    Returns: list of column names if return_names_list=True, else nothing.
    """
    catch = []
    dropped1 = []
    dropped2 = []
    if verbose:
        print("Column | df1 | df2 ")
        print("*----------------------*")

    # Loop through columns, inspect for duplicates
    for col in df1.columns:
        if col in df2.columns:
            catch.append(col)
            if verbose:
                print(f"{col} {df1[col].dtype} {df2[col].dtype}")

            # Accept user input and drop columns one by one
            if to_drop:
                choice = input("\nDrop this column? Enter 1. df1, 2. df2 or n for neither")
                if choice == "1":
                    df1.drop(columns=col, axis=1, inplace=True)
                    dropped1.append(col)
                elif choice == "2":
                    df2.drop(columns=col, axis=1, inplace=True)
                    dropped2.append(col)
                else:
                    continue

    # Display dropped columns and originating df
    if to_drop:
        if len(dropped1) >= 1:
            print(f"\nDropped from df1:\n{dropped1}")
        if len(dropped2) >= 1:
            print(f"\nDropped from df2:\n{dropped2}")

    if return_names_list:
        return catch
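# Example usage (illustrative sketch; df_train and df_test are assumed DataFrames):
# >>> shared = compare_duplicates(df_train, df_test, to_drop=False,
# ...                             verbose=False, return_names_list=True)
# >>> shared     # column names present in both frames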
## DataFrame styling
def check_column(panda_obj, columns=None, nlargest='all'):
    """Prints column name, datatype, # and % of null values, and unique values
    for the nlargest # of rows (by value counts). If passed a list of columns,
    it will only print results for those columns.

    Params:
        panda_obj: pandas DataFrame or Series.
        columns: list containing names of columns (strings).

    Returns:
        None; prints values only.
    """
    # Check for DF vs Series
    if isinstance(panda_obj, pd.Series):
        series = panda_obj
        print('\n----------------------------\n')
        print(f"Column: df['{series.name}']:")
        print(f"dtype: {series.dtype}")
        print(f"isna: {series.isna().sum()} out of {len(series)} - "
              f"{round(series.isna().sum()/len(series)*100, 3)}%")
        print('\nUnique non-na values:')
        if nlargest == 'all':
            print(series.value_counts())
        else:
            print(series.value_counts().nlargest(nlargest))

    elif isinstance(panda_obj, pd.DataFrame):
        df = panda_obj
        for col_name in df.columns:
            col = df[col_name]
            print("\n-----------------------------------------------")
            print(f"Column: df['{col_name}']:")
            print(f"dtype: {col.dtypes}")
            print(f"isna: {col.isna().sum()} out of {len(col)} - "
                  f"{round(col.isna().sum()/len(col)*100, 3)}%")
            print(f'\nUnique non-na values:\nnlargest={nlargest}\n-----------------')
            if nlargest == 'all':
                print(col.value_counts())
            else:
                print(col.value_counts().nlargest(nlargest))
## DataFrame Creation, Inspection, and Exporting
def inspect_df(df, n_rows=3, verbose=True):
    """EDA: Show all pandas inspection tables.
    Displays df.head(), df.info(), df.describe(). By default also runs
    check_null and check_numeric to inspect columns for null values and to
    check string columns for numeric values (if verbose==True).

    Parameters:
        df (DataFrame): dataframe to inspect.
        n_rows: number of header rows to show (default=3).
        verbose: if True (default), also runs check_null and check_numeric.

    Ex: inspect_df(df, n_rows=4)
    """
    from bs_ds.bamboo import check_column, check_null, check_numeric, check_unique
    from bs_ds.prettypandas import display_side_by_side
    import pandas as pd

    with pd.option_context("display.max_columns", None, 'display.precision', 4):
        df.info()  # .info() prints directly and returns None

        if verbose == True:
            df_num = check_numeric(df, unique_check=False, show_df=False)
            df_null = check_null(df, show_df=False)
            display_side_by_side(df_null, df_num, df.describe())
        else:
            display(df.describe())

        display(df.head(n_rows))
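# Example usage (illustrative sketch):
# >>> inspect_df(df)                            # head, info, describe, plus null/numeric checks
# >>> inspect_df(df, n_rows=5, verbose=False)   # inspection tables only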
def drop_cols(df, list_of_strings_or_regexp, verbose=0):
    """EDA: Takes a df and a list of strings or regular expressions and
    recursively removes all column names containing those strings or matching
    those expressions.

    Example:
        If the df_in columns are ['price','sqft','sqft_living','sqft15','sqft_living15','floors','bedrooms']:
        df_out = drop_cols(df_in, ['sqft', 'bedroom'])
        df_out.columns  # will output: ['price', 'floors']

    Parameters:
        df -- input dataframe to remove columns from.
        list_of_strings_or_regexp -- list of string patterns or regexp to remove.

    Returns:
        df_dropped -- input df without the dropped columns.
    """
    regex_list = list_of_strings_or_regexp
    df_cut = df.copy()
    for r in regex_list:
        df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
        if verbose > 0:
            print(f'Removed {r}.')
    df_dropped = df_cut
    return df_dropped
def add_filtered_col_to_df(df_source, df_to_add_to, list_of_exps, return_filtered_col_names=False):
    """Takes a source dataframe with columns to copy using
    df.filter(regex=exp, axis=1), with list_of_exps being a list of text
    expressions to find inside column names.
    """
    import pandas as pd
    filtered_col_list = {}
    for exp in list_of_exps:
        df_temp_filtered = df_source.filter(regex=exp, axis=1).copy()
        filtered_col_list[exp] = list(df_temp_filtered.columns)
        # Concatenate along columns (axis=1) so matched columns are added side by side
        df_to_add_to = pd.concat([df_to_add_to, df_temp_filtered], axis=1)

    if return_filtered_col_names == False:
        return df_to_add_to
    else:
        print(filtered_col_list)
        return df_to_add_to, filtered_col_list
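# Example usage (illustrative sketch; copies every 'sqft*' and 'grade*' column
# from an assumed df_full into an assumed df_model):
# >>> df_model, matched = add_filtered_col_to_df(df_full, df_model,
# ...                                            ['sqft', 'grade'],
# ...                                            return_filtered_col_names=True)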
## # EDA / Plotting Functions
def multiplot(df, annot=True, fig_size=None):
    """EDA: Plots results from df.corr() in a correlation heatmap for multicollinearity.
    Returns fig, ax objects."""
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set(style="white")

    # Compute the correlation matrix
    corr = df.corr()

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    idx = np.triu_indices_from(mask)
    mask[idx] = True

    # Set up the matplotlib figure
    if fig_size is None:
        figsize = (16, 16)
    else:
        figsize = fig_size
    f, ax = plt.subplots(figsize=figsize)

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, annot=annot, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    return f, ax
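# Example usage (illustrative sketch; numeric columns only, since df.corr()
# ignores object dtypes):
# >>> fig, ax = multiplot(df, annot=True, fig_size=(10, 10))
# >>> fig.savefig('corr_heatmap.png')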
# Plots histogram and scatter (vs price) side by side
def plot_hist_scat(df, target=None, figsize=(12, 9), fig_style='dark_background',
                   font_dict=None, plot_kwds=None):
    """EDA: Great summary plots of all columns of a df vs a target column.
    Shows distplots and regplots for columns in dataframe vs target.

    Parameters:
        df (DataFrame): DataFrame whose .describe() columns will be plotted.
        target (string): Name of column containing target variable (assumes
            first column if not specified).
        figsize (tuple): Tuple for figsize. Default=(12,9).
        fig_style: Figure style to use (in this context, will not change others
            in notebook). Default is 'dark_background'.
        font_dict: A keyword dictionary containing values for font properties
            under the keys "fontTitle", "fontAxis", and "fontTicks".
        plot_kwds: A keyword dictionary containing any of the following keys for
            dictionaries of valid matplotlib key:value pairs for plotting:
            "hist_kws", "kde_kws", "line_kws", "scatter_kws".
            Accepts any valid matplotlib key:value pairs passed by seaborn to
            matplotlib. Subplot 1: hist_kws, kde_kws. Subplot 2: line_kws, scatter_kws.

    Returns:
        fig: Figure object.
        ax: Subplot axes with format ax[row,col].
            Subplot 1 = ax[0,0]; Subplot 2 = ax[0,1].
    """
    import matplotlib.ticker as mtick
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Set target as first column if not specified
    if target is None:
        target = df.columns[0]

    ### DEFINE AESTHETIC CUSTOMIZATIONS ------------------------------- ##
    # Checking for user font_dict; if absent, setting defaults:
    if font_dict is None:
        fontTitle = {'fontsize': 16, 'fontweight': 'bold', 'fontfamily': 'serif'}
        fontAxis = {'fontsize': 14, 'fontweight': 'bold', 'fontfamily': 'serif'}
        fontTicks = {'fontsize': 12, 'fontweight': 'bold', 'fontfamily': 'serif'}
    else:
        fontTitle = font_dict.get('fontTitle', {'fontsize': 16, 'fontweight': 'bold', 'fontfamily': 'serif'})
        fontAxis = font_dict.get('fontAxis', {'fontsize': 14, 'fontweight': 'bold', 'fontfamily': 'serif'})
        fontTicks = font_dict.get('fontTicks', {'fontsize': 12, 'fontweight': 'bold', 'fontfamily': 'serif'})

    # Checking for user plot_kwds; falling back to defaults per key
    if plot_kwds is None:
        hist_kws = {"linewidth": 1, "alpha": 1, "color": 'steelblue', 'edgecolor': 'w', 'hatch': '\\'}
        kde_kws = {"color": "white", "linewidth": 3, "label": "KDE", 'alpha': 0.7}
        line_kws = {"color": "white", "alpha": 0.5, "lw": 3, "ls": ":"}
        scatter_kws = {'s': 2, 'alpha': 0.8, 'marker': '.', 'color': 'steelblue'}
    else:
        kwds = plot_kwds
        # Graphing keyword dictionaries for distplot (Subplot 1)
        hist_kws = kwds.get('hist_kws', {"linewidth": 1, "alpha": 1, "color": 'steelblue', 'edgecolor': 'w', 'hatch': '\\'})
        kde_kws = kwds.get('kde_kws', {"color": "white", "linewidth": 3, "label": "KDE", 'alpha': 0.7})
        # Keyword dictionaries for scatter and regression line (Subplot 2)
        line_kws = kwds.get('line_kws', {"color": "white", "alpha": 0.5, "lw": 3, "ls": ":"})
        scatter_kws = kwds.get('scatter_kws', {'s': 2, 'alpha': 0.8, 'marker': '.', 'color': 'steelblue'})

    with plt.style.context(fig_style):

        ### PLOTTING ----------------------------------------------------- ##
        # Loop through dataframe to plot
        for column in df.describe():

            # Create figure with subplots for current column
            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)

            ## SUBPLOT 1 -------------------------------------------------- ##
            i, j = 0, 0
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)

            # Plot distplot on ax[i,j] using hist_kws and kde_kws
            sns.distplot(df[column], norm_hist=True, kde=True,
                         hist_kws=hist_kws, kde_kws=kde_kws,
                         label=column + ' histogram', ax=ax[i, j])

            # Set x axis label
            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)

            # Get x-ticks, rotate labels, and return
            xticklab1 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-label
            ax[i, j].set_ylabel('Density', fontdict=fontAxis)
            yticklab1 = ax[i, j].get_yticklabels(which='both')
            ax[i, j].set_yticklabels(labels=yticklab1, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-grid
            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            ## SUBPLOT 2 -------------------------------------------------- ##
            i, j = 0, 1
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)

            # Plot regplot on ax[i,j] using line_kws and scatter_kws
            sns.regplot(df[column], df[target],
                        line_kws=line_kws, scatter_kws=scatter_kws, ax=ax[i, j])

            # Set x-axis label
            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)

            # Get x ticks, rotate labels, and return
            xticklab2 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab2, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-axis label
            ax[i, j].set_ylabel(target.title(), fontdict=fontAxis)

            # Get, set, and format y-axis labels
            yticklab = ax[i, j].get_yticklabels()
            ax[i, j].set_yticklabels(yticklab, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-grid
            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            ## ---------- Final layout adjustments ----------- ##
            # Delete unused subplots
            fig.delaxes(ax[1, 1])
            fig.delaxes(ax[1, 0])

            # Optimize spatial layout
            fig.tight_layout()

    return fig, ax
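# Example usage (illustrative sketch; 'price' is an assumed target column name):
# >>> fig, ax = plot_hist_scat(df, target='price')
# >>> fig, ax = plot_hist_scat(df, target='price',
# ...                          plot_kwds={'scatter_kws': {'s': 5, 'color': 'red'}})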
## Finding outliers and statistics
# Tukey's method using IQR to eliminate outliers
def detect_outliers(df, n, features):
    """Uses Tukey's method on the interquartile range to return the indices of
    rows with outliers in a dataframe.

    Parameters:
        df (DataFrame): DataFrame containing columns of features.
        n: multiple-outlier cutoff; rows with more than n outlier features are returned.
        features: list of column names to check.

    Returns:
        Index of outliers for .loc

    Examples:
        Outliers_to_drop = detect_outliers(data, 2, ["col1", "col2"])
        df.loc[Outliers_to_drop]  # Show the outlier rows
        data = data.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
    """
    from collections import Counter
    outlier_indices = []

    # Iterate over features (columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1

        # Outlier step
        outlier_step = 1.5 * IQR

        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index

        # Append the found outlier indices for col to the list of outlier indices
        outlier_indices.extend(outlier_list_col)

    # Select observations containing more than n outliers
    outlier_counts = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_counts.items() if v > n)

    return multiple_outliers
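# A minimal runnable demo of detect_outliers (illustrative sketch with
# synthetic data, not part of the library's test suite): rows 0 and 1 are
# extreme in both columns, so they exceed the Tukey fences in 2 features and
# are flagged when n=1.
if __name__ == '__main__':
    demo = pd.DataFrame({'a': [500, -500] + list(range(20)),
                         'b': [900, -900] + list(range(20))})
    outliers = detect_outliers(demo, 1, ['a', 'b'])
    print(outliers)                    # -> [0, 1]
    print(demo.drop(outliers).shape)   # -> (20, 2)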