Source code for cave.analyzer.parameter_importance.fanova

import operator
import os
from collections import OrderedDict

from pandas import DataFrame

from cave.analyzer.parameter_importance.base_parameter_importance import BaseParameterImportance


[docs]class Fanova(BaseParameterImportance):
    """
    fANOVA (functional analysis of variance) computes the fraction of the variance in the cost space explained by
    changing a parameter by marginalizing over all other parameters, for each parameter (or for pairs of
    parameters). Parameters with high importance scores will have a large impact on the performance.  To this end, a
    random forest is trained as an empirical performance model on the available empirical data from the available
    runhistories.
    """

    def __init__(self,
                 runscontainer,
                 marginal_threshold=0.05):
        """Wrapper for parameter_importance to save the importance-object/ extract the results. We want to show the
        top X most important parameter-fanova-plots.

        Parameters
        ----------
        runscontainer: RunsContainer
            contains all important information about the configurator runs
        marginal_threshold: float
            parameter/s must be at least this important to be mentioned
        """
        super().__init__(runscontainer)

        self.marginal_threshold = marginal_threshold
        self.parameter_importance("fanova")

[docs]    def get_name(self):
        return 'fANOVA'

[docs]    def postprocess(self, pimp, output_dir):
        result = OrderedDict()

        def parse_pairwise(p):
            """parse pimp's way of having pairwise parameters as key as str and return list of individuals"""
            res = [tmp.strip('\' ') for tmp in p.strip('[]').split(',')]
            return res

        parameter_imp = {k: v * 100 for k, v in pimp.evaluator.evaluated_parameter_importance.items()}
        param_imp_std = {}
        if hasattr(pimp.evaluator, 'evaluated_parameter_importance_uncertainty'):
            param_imp_std = {k: v * 100 for k, v in pimp.evaluator.evaluated_parameter_importance_uncertainty.items()}

        for k in parameter_imp.keys():
            self.logger.debug("fanova-importance for %s: mean (over trees): %f, std: %s", k, parameter_imp[k],
                              str(param_imp_std[k]) if param_imp_std else 'N/A')

        # Split single and pairwise (pairwise are string: "['p1','p2']")
        single_imp = {k: v for k, v in parameter_imp.items() if not k.startswith('[') and v > self.marginal_threshold}
        pairwise_imp = {k: v for k, v in parameter_imp.items() if k.startswith('[') and v > self.marginal_threshold}

        # Set internal parameter importance for further analysis (such as parallel coordinates)
        self.fanova_single_importance = single_imp
        self.fanova_pairwise_importance = single_imp

        # Dicts to lists of tuples, sorted descending after importance
        single_imp = OrderedDict(sorted(single_imp.items(), key=operator.itemgetter(1), reverse=True))
        pairwise_imp = OrderedDict(sorted(pairwise_imp.items(), key=operator.itemgetter(1), reverse=True))

        # Create table
        table = []
        if len(single_imp) > 0:
            table.extend([(20*"-"+" Single importance: "+20*"-", 20*"-")])
            for k, v in single_imp.items():
                value = str(round(v, 4))
                if param_imp_std:
                    value += " +/- " + str(round(param_imp_std[k], 4))
                table.append((k, value))
        if len(pairwise_imp) > 0:
            table.extend([(20*"-"+" Pairwise importance: "+20*"-", 20*"-")])
            for k, v in pairwise_imp.items():
                name = ' & '.join(parse_pairwise(k))
                value = str(round(v, 4))
                if param_imp_std:
                    value += " +/- " + str(round(param_imp_std[k], 4))
                table.append((name, value))

        keys, fanova_table = [k[0] for k in table], [k[1:] for k in table]
        df = DataFrame(data=fanova_table, index=keys)
        result['Importance'] = {'table': df.to_html(escape=False, header=False, index=True, justify='left')}

        # Get plot-paths
        result['Marginals'] = {p: {'figure': os.path.join(output_dir, "fanova", p + '.png')} for p in single_imp.keys()}
        # Right now no way to access paths of the plots -> file issue
        pairwise_plots = {" & ".join(parse_pairwise(p)):
                          os.path.join(output_dir, 'fanova', '_'.join(parse_pairwise(p)) + '.png')
                          for p in pairwise_imp.keys()}
        result['Pairwise Marginals'] = {p: {'figure': path}
                                        for p, path in pairwise_plots.items() if os.path.exists(path)}

        return result

[docs]    def get_jupyter(self):
        from IPython.core.display import HTML, Image, display
        for b, result in self.result.items():
            error = self.result[b]['else'] if 'else' in self.result[b] else None
            if error:
                display(HTML(error))
            else:
                # Show table
                display(HTML(self.result[b]["Importance"]["table"]))
                # Show plots
                display(*list([Image(filename=d["figure"]) for d in self.result[b]['Marginals'].values()]))
                display(*list([Image(filename=d["figure"]) for d in self.result[b]['Pairwise Marginals'].values()]))
                # While working for a prettier solution, this might be an option:
                # display(HTML(figure_to_html([d["figure"] for d in self.result[b]['Marginals'].values()] +
                #                            [d["figure"] for d in self.result[b]['Pairwise Marginals'].values()],
                #                            max_in_a_row=3, true_break_between_rows=True)))