# Source code for cave.analyzer.feature_analysis.feature_analysis

import copy
import logging
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from numpy import corrcoef
from pandas import DataFrame
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Module metadata: authorship, copyright, license and contact address.
__author__ = "Marius Lindauer"
__copyright__ = "Copyright 2016, ML4AAD"
__license__ = "MIT"
__email__ = "lindauer@cs.uni-freiburg.de"


class FeatureAnalysis(object):
    """Plot distributions, correlations and clusters of instance features.

    The feature values of all train (and, if given, test) instances of a
    scenario are collected into a DataFrame (rows: instances, columns:
    features). Every plotting method writes PNG files into the directory
    ``<output_dn>/feature_plots`` and returns the path(s) of what it wrote.

    From: https://github.com/mlindauer/asapy
    """

    def __init__(self,
                 output_dn: str,
                 scenario,
                 feat_names,
                 feat_importance=None):
        """
        Collect the feature data and create the output directory.

        From: https://github.com/mlindauer/asapy


        Arguments
        ---------
        output_dn: str
            output directory name; plots are saved in its sub-directory
            "feature_plots", which is created if it does not exist
        scenario: Scenario
            scenario for features; the attributes feature_names,
            train_insts, test_insts and feature_dict are read
        feat_names: list[str]
            names of features as list; the position of a name is used as
            index into the feature vectors of scenario.feature_dict
        feat_importance: dict[str] -> float
            maps names to importance
        """
        self.logger = logging.getLogger("Feature Analysis")
        self.scenario = scenario
        # NOTE(review): the names used for plotting come from the scenario,
        # while the DataFrame columns below come from the `feat_names`
        # argument -- both are presumably expected to agree; confirm.
        self.feat_names = scenario.feature_names
        self.feat_imp = feat_importance

        # Copy the instance list: extending scenario.train_insts in place
        # would leak the test instances into the scenario itself (and did so
        # once per feature before).
        insts = list(self.scenario.train_insts)
        if not self.scenario.test_insts == [None]:
            insts.extend(self.scenario.test_insts)

        # Column of each feature inside the per-instance feature vectors;
        # setdefault keeps the first position for repeated names, which is
        # what list.index() returned.
        column_of = {}
        for idx, name in enumerate(feat_names):
            column_of.setdefault(name, idx)

        feature_dict = self.scenario.feature_dict
        self.feature_data = DataFrame({
            name: {i: copy.deepcopy(feature_dict[i][col]) for i in insts}
            for name, col in column_of.items()
        })

        self.output_dn = os.path.join(output_dn, "feature_plots")
        if not os.path.isdir(self.output_dn):
            os.makedirs(self.output_dn)

    def get_box_violin_plots(self):
        """
            for each feature generate a plot with box and violin plot

            Missing values (NaN) are dropped before plotting. A feature
            without any valid value is skipped with a warning, since neither
            plot can be drawn from an empty vector.

            Returns
            -------
            list of tuples of feature name and feature plot file name
        """
        self.logger.debug("Plotting box and violin plots........")

        files_ = []

        for feat_name in sorted(self.feat_names):
            vec = self.feature_data[feat_name].values
            vec = vec[~np.isnan(vec)]
            if vec.size == 0:
                self.logger.warning(
                    "Feature %s has no valid values -> no box/violin plot!",
                    feat_name)
                continue

            fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 5))
            try:
                axes[0].violinplot(
                    [vec], showmeans=False, showmedians=True, vert=False)
                axes[0].xaxis.grid(True)
                plt.setp(axes[0], yticks=[1], yticklabels=[""])
                axes[1].boxplot(vec, vert=False)
                axes[1].xaxis.grid(True)
                plt.setp(axes[1], yticks=[1], yticklabels=[""])

                fig.tight_layout()

                # "/" in a feature name would be read as a path separator
                out_fn = os.path.join(
                    self.output_dn,
                    "violin_box_%s_plot.png" % (feat_name.replace("/", "_")))
                fig.savefig(out_fn)
            finally:
                # release the figure; otherwise one figure per feature
                # stays registered with pyplot
                plt.close(fig)
            files_.append((feat_name, out_fn))

        return files_

    @staticmethod
    def _similarity_matrix(values):
        """Return the feature-by-feature correlation matrix of `values`.

        `values` is a 2d array with one column per feature. Entries that are
        undefined (a constant feature has zero variance) are set to 0 and the
        diagonal is set to 1.
        """
        with np.errstate(divide="ignore", invalid="ignore"):
            data = np.array(
                np.atleast_2d(corrcoef(values, rowvar=False)), dtype=float)
        data[np.isnan(data)] = 0  # is nan if one feature vec is constant
        np.fill_diagonal(data, 1)
        return data

    @staticmethod
    def _ward_leaf_order(similarity):
        """Return feature indices in the leaf order of a ward clustering.

        The rows of the negated similarity matrix are clustered; the merges
        are replayed so that features merged early end up next to each other.
        """
        link = linkage(similarity * -1, 'ward')  # input is distance -> * -1

        clusters = [[idx] for idx in range(similarity.shape[0])]
        for li in link:
            # each linkage row merges two previously built clusters
            clusters.append(clusters[int(li[0])] + clusters[int(li[1])])
        return clusters[-1]

    def correlation_plot(self, imp=True):
        """
        generate correlation plot using the (pearson) correlation coefficient
        of numpy.corrcoef and ward clustering to order the features

        Arguments
        ---------
        imp: bool
            if True and feature importances are available, only plot the
            important features; otherwise plot all features and (if
            importances are available) print the important ones in red

        Returns
        -------
        path: str
            filename of saved plot (False if less than two features are
            available, since no correlation can be plotted then)
        """
        self.logger.debug("Plotting correlation plots........")

        only_important = bool(self.feat_imp and imp)
        # Check for important features
        if only_important:
            imp_features = [f for f in self.feat_names if f in self.feat_imp]
            if len(imp_features) < 2:
                self.logger.info("Less than two important features -> no correlation plot!")
                return False
        else:
            imp_features = list(self.feat_names)
            if len(imp_features) < 2:
                # the clustering below needs at least two features
                self.logger.info("Less than two features -> no correlation plot!")
                return False

        # impute missing values with the feature mean
        feature_data = self.feature_data[imp_features]
        feature_data = feature_data.fillna(feature_data.mean()).values

        data = self._similarity_matrix(feature_data)

        # resort data such that clustered features are adjacent
        order = self._ward_leaf_order(data)
        sorted_features = [imp_features[idx] for idx in order]
        data = data[order, :]
        data = data[:, order]

        fig, ax = plt.subplots()
        try:
            heatmap = ax.pcolor(data, cmap=plt.cm.seismic, vmin=-1, vmax=1)

            # put the major ticks at the middle of each cell
            ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
            ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)

            ax.set_xlim(0, data.shape[0])
            ax.set_ylim(0, data.shape[0])

            # want a more natural, table-like display
            ax.invert_yaxis()
            ax.xaxis.tick_top()

            ax.set_xticklabels(sorted_features, minor=False)
            ax.set_yticklabels(sorted_features, minor=False)
            if self.feat_imp and not imp:
                # all features are shown: highlight the important ones in red
                tick_pairs = zip(ax.xaxis.get_ticklabels(),
                                 ax.yaxis.get_ticklabels())
                for name, (tx, ty) in zip(sorted_features, tick_pairs):
                    color_ = (1., 0., 0.) if name in self.feat_imp else (0., 0., 0.)
                    tx.set_color(color_)
                    ty.set_color(color_)
            plt.setp(ax.get_xticklabels(), rotation=45, fontsize=2, ha="left")
            plt.setp(ax.get_yticklabels(), rotation=0, fontsize=2, ha="right")

            fig.colorbar(heatmap)

            fig.tight_layout()

            if only_important:
                out_plot = os.path.join(
                    self.output_dn, "correlation_plot_features_imp.png")
            else:
                out_plot = os.path.join(
                    self.output_dn, "correlation_plot_features.png")
            fig.savefig(out_plot, format="png", dpi=400)
        finally:
            plt.close(fig)

        return out_plot

    def cluster_instances(self):
        """
        Use pca to reduce feature dimensions to 2 and cluster instances using k-means afterwards

        The number of clusters is chosen from 2 to min(#instances, 12) - 1 by
        the best silhouette score; the clustering that achieved this score is
        the one that is plotted. If there are too few instances to score any
        clustering, all instances are plotted as a single cluster.

        Returns
        -------
        path: str
            filename of saved plot
        """
        self.logger.debug("Plotting clusters........")
        # impute missing data; probably already done, but to be on the safe
        # side
        feature_data = self.feature_data.fillna(
            self.feature_data.mean())

        # scale features
        features = StandardScaler().fit_transform(feature_data.values)

        # feature reduction: pca
        features = PCA(n_components=2).fit_transform(features)

        # cluster with k-means; keep the labels of the best run instead of
        # refitting, because a refit with a new random initialisation may
        # yield a different clustering than the one that was scored
        best_score = None
        y_pred = None
        for n_clusters in range(2, min(features.shape[0], 12)):
            labels = KMeans(n_clusters=n_clusters).fit_predict(features)
            score = silhouette_score(features, labels)
            if best_score is None or score > best_score:
                best_score = score
                y_pred = labels

        if y_pred is None:
            self.logger.warning(
                "Too few instances to cluster -> plotting a single cluster!")
            y_pred = np.zeros(features.shape[0], dtype=int)

        fig, ax = plt.subplots()
        try:
            # first pca component on the x-axis, second on the y-axis
            ax.scatter(features[:, 0], features[:, 1], c=y_pred)
            ax.set_xlabel('principal component 1')
            ax.set_ylabel('principal component 2')

            fig.tight_layout()
            out_fn = os.path.join(self.output_dn, "feature_clusters.png")
            fig.savefig(out_fn, format="png")
        finally:
            plt.close(fig)

        return out_fn