# Source code for bs_ds.glassboxes

# -*- coding: utf-8 -*-

"""A collection of modified tools to visualize the inner workings of model objects, especially CatBoost models."""
# from sklearn.tree import export_graphviz

def make_activations_model(model,idx_layers_to_show=None, verbose=True):
    """Build a Keras model that outputs the intermediate activations of `model`.

    The returned model shares `model.input` and has one output per selected
    layer, so it can be passed to plot_activations().

    Args:
        model: A built Keras model exposing `.layers` and `.input`.
        idx_layers_to_show: Which layers to expose.
            * None (default): every Conv2D and MaxPooling2D layer of `model`.
            * an int, or a sequence holding one int: that single layer.
            * a sequence of two ints `(start, end)`: the slice
              `model.layers[start:end]` (end exclusive).
        verbose: If truthy, print the summary of the new model.

    Returns:
        A `keras.models.Model` mapping `model.input` to the selected layers'
        outputs.

    Raises:
        ValueError: If `idx_layers_to_show` is not one of the shapes above, or
            if the selection contains no layers.
    """
    import keras
    from keras import models
    import numpy as np

    if idx_layers_to_show is None:
        # Public layer classes; isinstance also accepts subclasses.
        image_layer_types = (keras.layers.Conv2D, keras.layers.MaxPooling2D)
        layers_to_show = [layer for layer in model.layers
                          if isinstance(layer, image_layer_types)]
    else:
        # Normalise a bare int and 1-/2-element sequences to a 1-D array so the
        # number of indices can be read from .size (np.shape() returns a tuple,
        # which never compares equal to an int).
        idx = np.atleast_1d(idx_layers_to_show)
        if idx.ndim != 1 or idx.size not in (1, 2):
            raise ValueError(
                "idx_layers_to_show must be None, a single index, or a "
                "(start, end) pair; got {!r}".format(idx_layers_to_show))

        if idx.size == 2:
            idx_start, idx_end = int(idx[0]), int(idx[1])
            layers_to_show = list(model.layers[idx_start:idx_end])
        else:
            layers_to_show = [model.layers[int(idx[0])]]

    if not layers_to_show:
        raise ValueError("No layers were selected to build the activations model.")

    layer_outputs = [layer.output for layer in layers_to_show]

    # Now that we have layer_outputs, create the activation model
    activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
    if verbose:
        # summary() prints by itself and returns None, so it is not wrapped in print()
        activation_model.summary()

    return activation_model

def plot_activations(activations_model, img_tensor, n_cols=16,process=True,colormap='viridis'):
    """Plot every channel of every output of an activations model as an image grid.

    Args:
        activations_model: Model produced by make_activations_model(); its
            `predict` output is expected to be one array per exposed layer,
            each indexed as [sample, row, col, channel].
        img_tensor: Input passed straight to `activations_model.predict`.
            Only the first sample of the batch is plotted.
        n_cols: Number of columns in the subplot grid.
        process: If truthy, standardise each channel (zero mean, unit std),
            then rescale by 64 and shift by 128 before clipping to 0-255.
        colormap: Matplotlib colormap name used for every channel image.

    Returns:
        (fig, axes): the matplotlib Figure and a 2-D array of Axes with shape
        (n_rows, n_cols).
    """
    import math
    import matplotlib.pyplot as plt
    import numpy as np

    # Generate activations from model
    activations = activations_model.predict(img_tensor)
    # A model with a single output returns a bare array instead of a list
    if not isinstance(activations, (list, tuple)):
        activations = [activations]

    # Label each activation with the layer that produced it. `.layers` also
    # lists the input layer (and any layer between input and outputs), so it
    # does not line up with the outputs; `output_names` does.
    # NOTE(review): relies on the Keras Model exposing `output_names` - falls
    # back to the plain layer list when it is missing or has a different length.
    layer_names = list(getattr(activations_model, 'output_names', None) or [])
    if len(layer_names) != len(activations):
        layer_names = [layer.name for layer in activations_model.layers]

    # Calculate the number of rows and columns for the figure
    total_features = sum(a.shape[-1] for a in activations)  # shape[-1] == number of channels
    n_rows = max(1, math.ceil(total_features / n_cols))

    # squeeze=False keeps `axes` 2-D even for a single row or column
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(n_cols, n_rows*1.2), squeeze=False)

    iteration = 0
    for layer_n, layer_activation in enumerate(activations):
        n_channels = layer_activation.shape[-1]

        for ch_idx in range(n_channels):
            row, column = divmod(iteration, n_cols)
            ax = axes[row, column]

            # Copy so the in-place scaling below does not modify `activations`
            channel_image = layer_activation[0, :, :, ch_idx].copy()

            if process:
                # z-score the channel, then map it into a displayable range
                channel_image -= channel_image.mean()
                channel_std = channel_image.std()
                if channel_std > 0:  # constant channel: avoid dividing by zero
                    channel_image /= channel_std

                channel_image *= 64
                channel_image += 128

            channel_image = np.clip(channel_image, 0, 255).astype('uint8')

            ax.imshow(channel_image, aspect='auto',cmap=colormap)

            # Remove x and y ticks
            ax.get_xaxis().set_ticks([])
            ax.get_yaxis().set_ticks([])

            # Add labels for first channel in layer
            if ch_idx == 0 and layer_n < len(layer_names):
                ax.set_title(layer_names[layer_n],fontsize=10)
            iteration += 1

    # Blank out the unused cells at the end of the grid
    for unused_ax in axes.flat[iteration:]:
        unused_ax.axis('off')

    fig.subplots_adjust(hspace=1.25)
    plt.show()

    return fig, axes


def plot_auc_roc_curve(y_test, y_test_pred):
    """Plot the ROC curve of a binary classifier and show its AUC in the legend.

    Args:
        y_test: True binary labels.
        y_test_pred: Predicted scores. Either a 2-D array whose column 1 holds
            the positive-class score (e.g. the output of `predict_proba`), or
            a 1-D array of positive-class scores.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    from sklearn.metrics import roc_curve, roc_auc_score
    import matplotlib.pyplot as plt
    import numpy as np

    scores = np.asarray(y_test_pred)
    if scores.ndim == 2:
        scores = scores[:, 1]  # positive-class column

    # Named auc_score so it cannot be confused with sklearn.metrics.auc
    auc_score = roc_auc_score(y_test, scores)
    fpr, tpr, _ = roc_curve(y_test, scores)

    plt.plot(fpr, tpr, label=f"AUC for Classifier:\n{round(auc_score,2)}")

    # Diagonal reference line (chance level)
    plt.plot([0, 1], [0, 1],  lw=2,linestyle='--')
    plt.xlim([-0.01, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=None,
                          print_matrix=True):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        cm: Square confusion matrix (rows are true labels, columns are
            predicted labels, matching the axis labels drawn below).
        classes: Tick labels, one per class, used on both axes.
        normalize: If truthy, divide each row by its sum before plotting and
            annotate cells with two decimals instead of integers.
        title: Title of the plot.
        cmap: Matplotlib colormap; defaults to "Blues" when None.
        print_matrix: If truthy, also print the (possibly normalized) matrix
            and a one-line description to stdout.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    import itertools
    import numpy as np
    import matplotlib.pyplot as plt

    if cmap is None:
        cmap = plt.get_cmap("Blues")

    cm = np.asarray(cm)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    if print_matrix:
        if normalize:
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the labels are set so they are included in the layout
    plt.tight_layout()
    plt.show()


# Display graphviz tree
def viz_tree(tree_object):
    """Render a fitted scikit-learn decision tree as a PNG image.

    The tree is exported to Graphviz DOT text with `export_graphviz`, turned
    into a graph by pydotplus, and rendered to PNG.

    Args:
        tree_object: A fitted decision tree accepted by
            `sklearn.tree.export_graphviz`.

    Returns:
        An `IPython.display.Image` wrapping the PNG bytes.
    """
    # io.StringIO replaces sklearn.externals.six.StringIO, which is no longer
    # shipped with current scikit-learn releases.
    from io import StringIO
    from IPython.display import Image
    from sklearn.tree import export_graphviz
    import pydotplus

    dot_data = StringIO()
    export_graphviz(tree_object, out_file=dot_data, filled=True, rounded=True,special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    tree_viz = Image(graph.create_png())
    return tree_viz

def plot_cat_feature_importances(cb_clf):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Reads `cb_clf.feature_names_` and `cb_clf.feature_importances_`, plots the
    importances sorted in ascending order, and returns them as a pandas Series
    indexed by feature name (in the model's original, unsorted order).
    """
    import pandas as pd

    names = cb_clf.feature_names_
    scores = cb_clf.feature_importances_
    importance_by_feature = pd.Series(scores, index=names)

    # Only the plotted copy is sorted; the returned Series keeps model order
    ranked = importance_by_feature.sort_values()
    ranked.plot(kind='barh')

    return importance_by_feature


class Clock(object):
    """A stopwatch for timing code using timezone-aware local wall-clock time.

    Usage: `.tic()` starts the timer, `.lap()` records the time since the
    previous tic/lap in `_lap_times_list_`, and `.toc()` stops the timer,
    appends a 'TOTAL' row and (by default) displays a summary table.

    Args:
        display_final_time_as_minutes: If truthy, the total duration is shown
            as '<m> min, <s> sec'; otherwise as total seconds.
        verbose: 0 prints nothing; any value > 0 prints a line on tic, on
            every lap and on toc. (Levels 1 and 2 currently behave the same.)
    """

    def __init__(self, display_final_time_as_minutes=True, verbose=2):
        # tzlocal is optional: without it the system local zone is used,
        # because datetime.astimezone(None) converts to local time.
        try:
            from tzlocal import get_localzone
            self._timezone_ = get_localzone()
        except ImportError:
            self._timezone_ = None

        self._strformat_ = "%m/%d/%y - %I:%M:%S %p"
        self._verbose_ = verbose
        self._display_as_minutes_ = display_final_time_as_minutes
        self._decorate_ = '--- '

        # Timing state; populated by tic()/lap()/toc()
        self._start_time_ = None
        self._prior_start_time_ = None
        self._lap_label_ = None
        self._lap_end_time_ = None
        self._lap_duration_ = None
        self._lap_counter_ = 0
        self._lap_times_list_ = []

    def _require_started(self):
        """Raise a clear error when lap()/toc() is called before tic()."""
        if self._start_time_ is None:
            raise RuntimeError('Clock has not been started; call .tic() first.')

    def get_time(self,local=True):
        """Returns current time, in local time zone by default (local=True).

        The result is always timezone-aware; with local=False it is in UTC.
        """
        from datetime import datetime, timezone

        now_utc = datetime.now(timezone.utc)
        if local:
            return now_utc.astimezone(self._timezone_)
        return now_utc

    def mark_lap_list(self, label=None):
        """Used internally by .lap(): appends the current lap's row to
        `_lap_times_list_` as [lap #, start time, stop time, label, duration]."""
        if label is None:
            label='--'

        duration = self._lap_duration_.total_seconds()
        self._lap_times_list_.append([ self._lap_counter_ , # Lap #
                                      (self._prior_start_time_).strftime(self._strformat_), # This lap's start time
                                      self._lap_end_time_, # stop clock time (already formatted by lap())
                                      label, # The label passed with .lap()
                                      f'{duration:.3f} sec']) # the lap duration

    def tic(self, label=None ):
        """Start (or restart) the timer and reset the lap list.

        Args:
            label: Optional label stored as `_start_label_` and shown in the
                start message.
        """
        self._start_time_ = self.get_time()
        self._start_label_ = label
        self._prior_start_time_=self._start_time_

        # Initiate lap counter and list (first row is the header)
        self._lap_times_list_ = [['Lap #','Start Time','Stop Time', 'Label', 'Duration']]
        self._lap_counter_ = 0
        self._decorate_ = '--- '
        decorate=self._decorate_
        base_msg = f'{decorate}CLOCK STARTED @: {self._start_time_.strftime(self._strformat_):>{25}}'

        if label is None:
            display_msg = base_msg+' '+ decorate
        else:
            spacer = ' '
            display_msg = base_msg+f'{spacer:{10}} Label: {label:{10}} {decorate}'
        if self._verbose_>0:
            print(display_msg)

    def toc(self,label=None, summary=True):
        """Stop the timer, append the 'TOTAL' row and optionally show the summary.

        Args:
            label: Optional label stored in the 'TOTAL' row ('--' if None).
            summary: If truthy, call .summary() after recording the total.

        Raises:
            RuntimeError: If called before .tic().
        """
        self._require_started()
        if label is None:
            label='--'

        _final_end_time_ = self.get_time()

        self._lap_counter_+=1
        self._final_end_time_ = _final_end_time_
        self._lap_label_ = label
        self._lap_end_time_ = _final_end_time_.strftime(self._strformat_)
        self._lap_duration_ = _final_end_time_ - self._prior_start_time_
        self._total_time_ = _final_end_time_ - self._start_time_

        decorate=self._decorate_
        total_seconds = self._total_time_.total_seconds()
        if self._display_as_minutes_:
            total_mins, sec_remain = divmod(total_seconds, 60)
            total_time_to_display = f'{int(total_mins)} min, {sec_remain:.3f} sec'
        else:
            # Report the full elapsed seconds; taking them modulo 60 would
            # silently drop every whole minute.
            total_time_to_display = f'{total_seconds:.3f} sec'

        # Append summary line
        self._lap_times_list_.append(['TOTAL',
                                      self._start_time_.strftime(self._strformat_),
                                      self._final_end_time_.strftime(self._strformat_),
                                      label,
                                      total_time_to_display])

        if self._verbose_>0:
            print(f'--- TOTAL DURATION   =  {total_time_to_display:>{15}} {decorate}')

        if summary:
            self.summary()

    def lap(self, label=None):
        """Record the time, duration and label of the lap that just ended.

        The duration is measured from the previous tic()/lap() call. The row
        is stored through .mark_lap_list(); a line is printed when verbose > 0.

        Raises:
            RuntimeError: If called before .tic().
        """
        self._require_started()
        if label is None:
            label='--'
        _end_time_ = self.get_time()

        # Update the lap attributes and counter
        self._lap_label_ = label
        self._lap_end_time_ = _end_time_.strftime(self._strformat_)
        self._lap_counter_+=1
        self._lap_duration_ = (_end_time_ - self._prior_start_time_)
        # Now update the record
        self.mark_lap_list(label=label)

        # The end of this lap is the start of the next one
        self._prior_start_time_=_end_time_
        spacer = ' '

        if self._verbose_>0:
            # `gap` reproduces the 12 spaces that the former backslash line
            # continuations embedded in the message, keeping the printed
            # layout unchanged.
            gap = ' ' * 12
            print(f'       - Lap # {self._lap_counter_} @:  '
                  f'{gap}{self._lap_end_time_:>{25}} {spacer:{5}} Dur: {self._lap_duration_.total_seconds():.3f} sec.'
                  f'{gap}{spacer:{5}}Label:  {self._lap_label_:{20}}')

    def summary(self):
        """Display a styled dataframe summarising the recorded laps."""
        # Imported here rather than at class level so defining Clock does not
        # import the bs_ds package (bs_ds itself contains this module).
        from bs_ds import list2df
        from IPython.display import display

        # NOTE(review): assumes list2df uses the first row as the column
        # header - the column selection by name below depends on it.
        df_lap_times = list2df(self._lap_times_list_)
        df_lap_times = df_lap_times[['Lap #','Start Time','Duration','Label']]

        styler = df_lap_times.style
        # Styler.hide_index() was removed in pandas 2.0; prefer hide(axis='index')
        if hasattr(styler, 'hide'):
            styler = styler.hide(axis='index')
        else:
            styler = styler.hide_index()
        dfs = styler.set_caption('Summary Table of Clocked Processes').set_properties(subset=['Start Time','Duration'],**{'width':'140px'})
        display(dfs.set_table_styles([dict(selector='table, th', props=[('text-align', 'center')])]))