Skip to content

Metafeatures

A MetaFeature is a statistic about a dataset or task that can be used to make datasets or tasks more comparable, thus enabling meta-learning methods.

Calculating the meta-features of a dataset is quite straightforward.

Metafeatures
import openml
from amltk.metalearning import compute_metafeatures

# Fetch dataset 31 (credit-g) through the openml package. Only the data itself
# is requested; the feature-metadata and qualities downloads are switched off.
dataset = openml.datasets.get_dataset(
    31,  # credit-g
    download_data=True,
    download_features_meta_data=False,
    download_qualities=False,
)
# Split into features `X` and target `y`, using the dataset's default target
# attribute. The two remaining return values are not needed in this example.
X, y, _, _ = dataset.get_data(
    dataset_format="dataframe",
    target=dataset.default_target_attribute,
)

# No `features` argument is given, so every metafeature yielded by
# `MetaFeature.iter()` is computed.
mfs = compute_metafeatures(X, y)

print(mfs)
instance_count                                           1000.000000
log_instance_count                                          6.907755
number_of_classes                                           2.000000
number_of_features                                         20.000000
log_number_of_features                                      2.995732
percentage_missing_values                                   0.000000
percentage_of_instances_with_missing_values                 0.000000
percentage_of_features_with_missing_values                  0.000000
percentage_of_categorical_columns_with_missing_values       0.000000
percentage_of_categorical_values_with_missing_values        0.000000
percentage_of_numeric_columns_with_missing_values           0.000000
percentage_of_numeric_values_with_missing_values            0.000000
number_of_numeric_features                                  7.000000
number_of_categorical_features                             13.000000
ratio_numerical_features                                    0.350000
ratio_categorical_features                                  0.650000
ratio_features_to_instances                                 0.020000
minority_class_imbalance                                    0.200000
majority_class_imbalance                                    0.200000
class_imbalance                                             0.400000
mean_categorical_imbalance                                  0.500500
std_categorical_imbalance                                   0.234994
skewness_mean                                               0.920379
skewness_std                                                0.904952
skewness_min                                               -0.531348
skewness_max                                                1.949628
kurtosis_mean                                               0.924278
kurtosis_std                                                1.785467
kurtosis_min                                               -1.381449
kurtosis_max                                                4.292590
dtype: float64

By default, compute_metafeatures() calculates every implemented MetaFeature, iterating through the subclasses of MetaFeature to find them. You can also pass an explicit list: compute_metafeatures(X, y, features=[...]).

Implementing your own is also quite straightforward:

Create Metafeature
from amltk.metalearning import MetaFeature, compute_metafeatures
import openml
import pandas as pd  # required: the annotations below reference `pd` at definition time

# Fetch dataset 31 (credit-g) through the openml package, data only.
dataset = openml.datasets.get_dataset(
    31,  # credit-g
    download_data=True,
    download_features_meta_data=False,
    download_qualities=False,
)
# Features `X` and target `y`; the other two return values are unused here.
X, y, _, _ = dataset.get_data(
    dataset_format="dataframe",
    target=dataset.default_target_attribute,
)


# A custom metafeature only needs to subclass `MetaFeature` and implement
# `compute()`. Its reported name is derived from the class name, so this one
# shows up as "total_values" in the result.
#
# NOTE: the class deliberately has no docstring. `description()` returns the
# class docstring or "", so this metafeature has an empty description.
class TotalValues(MetaFeature):

    @classmethod
    def compute(
        cls,
        x: pd.DataFrame,
        y: pd.Series | pd.DataFrame,
        dependancy_values: dict,
    ) -> int:
        """Return the number of cells in `x`, i.e. rows times columns.

        Args:
            x: The features of the dataset.
            y: The labels of the dataset (unused).
            dependancy_values: Values of previously computed statistics
                (unused, this metafeature declares no dependencies).
        """
        return int(x.shape[0] * x.shape[1])


# Restrict the computation to the custom metafeature only.
mfs = compute_metafeatures(X, y, features=[TotalValues])
print(mfs)
total_values    20000
dtype: int64

As many metafeatures rely on pre-computed dataset statistics, and these do not need to be calculated more than once, you can specify the dependencies of a metafeature. When a metafeature would return something other than a single value, e.g. a dict or a pd.DataFrame, we instead call it a DatasetStatistic. These will not be included in the result of compute_metafeatures(). Each DatasetStatistic is only calculated once per call to compute_metafeatures(), so it can be re-used across all MetaFeatures that require that dependency.

Metafeature Dependency
from amltk.metalearning import MetaFeature, DatasetStatistic, compute_metafeatures
import openml
import pandas as pd  # required: the annotations below reference `pd` at definition time

# Fetch dataset 31 (credit-g) through the openml package, data only.
dataset = openml.datasets.get_dataset(
    31,  # credit-g
    download_data=True,
    download_features_meta_data=False,
    download_qualities=False,
)
# Features `X` and target `y`; the other two return values are unused here.
X, y, _, _ = dataset.get_data(
    dataset_format="dataframe",
    target=dataset.default_target_attribute,
)


# A `DatasetStatistic` may return any value (here a whole DataFrame). It is
# left out of the result of `compute_metafeatures()`, which only keeps
# `MetaFeature` subclasses.
class NAValues(DatasetStatistic):
    """A mask of all NA values in a dataset"""

    @classmethod
    def compute(
        cls,
        x: pd.DataFrame,
        y: pd.Series | pd.DataFrame,
        dependancy_values: dict,
    ) -> pd.DataFrame:
        """Return a boolean mask with the shape of `x`, True where a value is NA."""
        return x.isna()


class PercentageNA(MetaFeature):
    """The percentage of values missing"""

    # Statistics that must be computed before this metafeature. Their values
    # are handed to `compute()` through `dependancy_values`.
    dependencies = (NAValues,)

    @classmethod
    def compute(
        cls,
        x: pd.DataFrame,
        y: pd.Series | pd.DataFrame,
        dependancy_values: dict,
    ) -> float:
        """Return the number of NA cells divided by the total number of cells.

        Args:
            x: The features of the dataset.
            y: The labels of the dataset (unused).
            dependancy_values: Previously computed statistics keyed by their
                class; must contain the `NAValues` mask.
        """
        na_values = dependancy_values[NAValues]
        # First sum counts the NA cells per column, second sum totals them.
        n_na = na_values.sum().sum()
        n_values = int(x.shape[0] * x.shape[1])
        return float(n_na / n_values)


mfs = compute_metafeatures(X, y, features=[PercentageNA])
print(mfs)
percentage_n_a    0.0
dtype: float64

To view the description of a particular MetaFeature, you can call .description() on it. Otherwise you can access all of them in the following way:

Metafeature Descriptions
from pprint import pprint
from amltk.metalearning import metafeature_descriptions

# Print every metafeature name framed by "---" rules, followed by its
# description as a bullet. One print call emits the same four lines per entry.
for name, description in metafeature_descriptions().items():
    print("---", name, "---", " * " + description, sep="\n")
---
instance_count
---
 * Number of instances in the dataset.
---
log_instance_count
---
 * Logarithm of the number of instances in the dataset.
---
number_of_classes
---
 * Number of classes in the dataset.
---
number_of_features
---
 * Number of features in the dataset.
---
log_number_of_features
---
 * Logarithm of the number of features in the dataset.
---
percentage_missing_values
---
 * Percentage of missing values in the dataset.
---
percentage_of_instances_with_missing_values
---
 * Percentage of instances with missing values.
---
percentage_of_features_with_missing_values
---
 * Percentage of features with missing values.
---
percentage_of_categorical_columns_with_missing_values
---
 * Percentage of categorical columns with missing values.
---
percentage_of_categorical_values_with_missing_values
---
 * Percentage of categorical values with missing values.
---
percentage_of_numeric_columns_with_missing_values
---
 * Percentage of numeric columns with missing values.
---
percentage_of_numeric_values_with_missing_values
---
 * Percentage of numeric values with missing values.
---
number_of_numeric_features
---
 * Number of numeric features in the dataset.
---
number_of_categorical_features
---
 * Number of categorical features in the dataset.
---
ratio_numerical_features
---
 * Ratio of numerical features to total features in the dataset.
---
ratio_categorical_features
---
 * Ratio of categoricals features to total features in the dataset.
---
ratio_features_to_instances
---
 * Ratio of features to instances in the dataset.
---
minority_class_imbalance
---
 * Imbalance of the minority class in the dataset. 0 => Balanced. 1 imbalanced.
---
majority_class_imbalance
---
 * Imbalance of the majority class in the dataset. 0 => Balanced. 1 imbalanced.
---
class_imbalance
---
 * Mean Target Imbalance of the classes in general.

    0 => Balanced. 1 Imbalanced.

---
mean_categorical_imbalance
---
 * The mean imbalance of categorical features.
---
std_categorical_imbalance
---
 * The std imbalance of categorical features.
---
skewness_mean
---
 * The mean skewness of numerical features.
---
skewness_std
---
 * The std skewness of numerical features.
---
skewness_min
---
 * The min skewness of numerical features.
---
skewness_max
---
 * The max skewness of numerical features.
---
kurtosis_mean
---
 * The mean kurtosis of numerical features.
---
kurtosis_std
---
 * The std kurtosis of numerical features.
---
kurtosis_min
---
 * The min kurtosis of numerical features.
---
kurtosis_max
---
 * The max kurtosis of numerical features.
---
total_values
---
 * 
---
percentage_n_a
---
 * The percentage of values missing

class DatasetStatistic #

Bases: ABC, Generic[S]

Base class for a dataset statistic.

A dataset statistic is a function that takes a dataset and returns some value(s) that describe the dataset.

If looking to create meta-features, see the MetaFeature class which restricts the statistic to be a single number.

def description()
classmethod
#

Return the description of this statistic.

Source code in src/amltk/metalearning/metafeatures.py
@classmethod
def description(cls) -> str:
    """Give the description of this statistic.

    The description is the docstring of the class itself. A class without
    a docstring (or with an empty one) yields an empty string.
    """
    doc = cls.__doc__
    return doc if doc else ""

def name()
classmethod
#

Return the name of this statistic.

Source code in src/amltk/metalearning/metafeatures.py
@classmethod
def name(cls) -> str:
    """Give the name under which this statistic is reported.

    The name is built from the class name: every match of the module-level
    `CAMEL_CASE_PATTERN` is replaced by an underscore and the result is
    lower-cased.
    """
    underscored = CAMEL_CASE_PATTERN.sub("_", cls.__name__)
    return underscored.lower()

def compute(x, y, dependancy_values)
abstractmethod classmethod
#

Compute the value of this statistic.

PARAMETER DESCRIPTION
x

The features of the dataset.

TYPE: DataFrame

y

The labels of the dataset.

TYPE: Series | DataFrame

dependancy_values

A dictionary of dependency values.

TYPE: DSdict

RETURNS DESCRIPTION
S

The value of this statistic.

Source code in src/amltk/metalearning/metafeatures.py
@classmethod
@abstractmethod
def compute(
    cls,
    x: pd.DataFrame,
    y: pd.Series | pd.DataFrame,
    dependancy_values: DSdict,
) -> S:
    """Compute the value of this statistic.

    This method is abstract and has no default implementation.

    Args:
        x: The features of the dataset.
        y: The labels of the dataset.
        dependancy_values: A dictionary of dependency values, keyed by
            statistic class. When called through `compute_metafeatures()`,
            the statistics listed in `dependencies` have already been
            computed and stored in it.

    Returns:
        The value of this statistic.
    """

def retrieve(dependancy_values)
classmethod
#

Retrieve the value of this statistic from the dependency values.

PARAMETER DESCRIPTION
dependancy_values

A dictionary of dependency values.

TYPE: Mapping[type[DatasetStatistic[T]], T]

RETURNS DESCRIPTION
S

The value of this statistic.

Source code in src/amltk/metalearning/metafeatures.py
@classmethod
def retrieve(
    cls,
    dependancy_values: Mapping[type[DatasetStatistic[T]], T],
) -> S:
    """Look up the already computed value of this statistic.

    Args:
        dependancy_values: Computed values, keyed by statistic class.

    Returns:
        The entry stored under this class. The lookup is a plain
        subscript, so a missing entry raises whatever the mapping
        raises for an unknown key.
    """
    value = dependancy_values[cls]  # type: ignore
    return value

class MetaFeature #

Bases: DatasetStatistic[M]

Used to indicate a metafeature to include.

This differs from DatasetStatistic in that it must return a single value.

skip: bool
classvar
#

Whether to skip this metafeature when iter() is called.

def iter()
classmethod
#

Return all the subclasses of MetaFeature.

Source code in src/amltk/metalearning/metafeatures.py
@classmethod
def iter(cls) -> Iterator[type[MetaFeature]]:
    """Yield the direct subclasses of this class that are not skipped.

    Subclasses whose `skip` attribute is truthy are left out.
    """
    yield from (sub for sub in cls.__subclasses__() if not sub.skip)

class NAValues #

Bases: DatasetStatistic[DataFrame]

Mask of missing values in the dataset.

class ClassImbalanceRatios #

Bases: DatasetStatistic[tuple[Series, float]]

Imbalance ratios of each class in the dataset.

Returns the ratios of each class together with the ratio expected if the classes were perfectly balanced.

class CategoricalImbalanceRatios #

Bases: DatasetStatistic[dict[str, tuple[Series, float]]]

Imbalance ratios of each categorical column in the dataset.

For each categorical column, returns the ratios of its categories together with the ratio expected if they were perfectly balanced.

class CategoricalColumns #

Bases: DatasetStatistic[DataFrame]

The categorical columns in the dataset.

class NumericalColumns #

Bases: DatasetStatistic[DataFrame]

The numerical columns in the dataset.

class InstanceCount #

Bases: MetaFeature[int]

Number of instances in the dataset.

class LogInstanceCount #

Bases: MetaFeature[float]

Logarithm of the number of instances in the dataset.

class NumberOfClasses #

Bases: MetaFeature[int]

Number of classes in the dataset.

class NumberOfFeatures #

Bases: MetaFeature[int]

Number of features in the dataset.

class LogNumberOfFeatures #

Bases: MetaFeature[float]

Logarithm of the number of features in the dataset.

class PercentageMissingValues #

Bases: MetaFeature[float]

Percentage of missing values in the dataset.

class PercentageOfInstancesWithMissingValues #

Bases: MetaFeature[float]

Percentage of instances with missing values.

class PercentageOfFeaturesWithMissingValues #

Bases: MetaFeature[float]

Percentage of features with missing values.

class PercentageOfCategoricalColumnsWithMissingValues #

Bases: MetaFeature[float]

Percentage of categorical columns with missing values.

class PercentageOfCategoricalValuesWithMissingValues #

Bases: MetaFeature[float]

Percentage of categorical values with missing values.

class PercentageOfNumericColumnsWithMissingValues #

Bases: MetaFeature[float]

Percentage of numeric columns with missing values.

class PercentageOfNumericValuesWithMissingValues #

Bases: MetaFeature[float]

Percentage of numeric values with missing values.

class NumberOfNumericFeatures #

Bases: MetaFeature[int]

Number of numeric features in the dataset.

class NumberOfCategoricalFeatures #

Bases: MetaFeature[int]

Number of categorical features in the dataset.

class RatioNumericalFeatures #

Bases: MetaFeature[float]

Ratio of numerical features to total features in the dataset.

class RatioCategoricalFeatures #

Bases: MetaFeature[float]

Ratio of categorical features to total features in the dataset.

class RatioFeaturesToInstances #

Bases: MetaFeature[float]

Ratio of features to instances in the dataset.

class ClassCounts #

Bases: DatasetStatistic[Series]

Number of instances per class.

class MinorityClassImbalance #

Bases: MetaFeature[float]

Imbalance of the minority class in the dataset. 0 => Balanced. 1 imbalanced.

class MajorityClassImbalance #

Bases: MetaFeature[float]

Imbalance of the majority class in the dataset. 0 => Balanced. 1 imbalanced.

class ClassImbalance #

Bases: MetaFeature[float]

Mean Target Imbalance of the classes in general.

0 => Balanced. 1 Imbalanced.

class ImbalancePerCategory #

Bases: DatasetStatistic[dict[str, float]]

Imbalance of each categorical feature. 0 => Balanced. 1 most imbalanced.

No categories implies perfectly balanced.

class MeanCategoricalImbalance #

Bases: MetaFeature[float]

The mean imbalance of categorical features.

class StdCategoricalImbalance #

Bases: MetaFeature[float]

The std imbalance of categorical features.

class SkewnessPerNumericalColumn #

Bases: DatasetStatistic[dict[str, float]]

Skewness of each numerical feature.

class SkewnessMean #

Bases: MetaFeature[float]

The mean skewness of numerical features.

class SkewnessStd #

Bases: MetaFeature[float]

The std skewness of numerical features.

class SkewnessMin #

Bases: MetaFeature[float]

The min skewness of numerical features.

class SkewnessMax #

Bases: MetaFeature[float]

The max skewness of numerical features.

class KurtosisPerNumericalColumn #

Bases: DatasetStatistic[dict[str, float]]

Kurtosis of each numerical feature.

class KurtosisMean #

Bases: MetaFeature[float]

The mean kurtosis of numerical features.

class KurtosisStd #

Bases: MetaFeature[float]

The std kurtosis of numerical features.

class KurtosisMin #

Bases: MetaFeature[float]

The min kurtosis of numerical features.

class KurtosisMax #

Bases: MetaFeature[float]

The max kurtosis of numerical features.

def imbalance_ratios(col) #

Compute the ratio of each unique value in a column.

The ratios are the normalized value counts of the column, ignoring missing values. They are returned together with the ratio every value would have if the column were perfectly balanced, so that the distance of each item's ratio to the balanced ratio can be computed afterwards, e.g. with column_imbalance().

PARAMETER DESCRIPTION
col

A column of values. If a DataFrame, the values from the subset of columns will be used.

TYPE: Series | DataFrame

RETURNS DESCRIPTION
Series

A tuple of the imbalance ratios, sorted from lowest (0) to highest (1)

float

and the expected ratio if perfectly balanced.

Source code in src/amltk/metalearning/metafeatures.py
def imbalance_ratios(col: pd.Series | pd.DataFrame) -> tuple[pd.Series, float]:
    """Compute the imbalance ratio of a categorical column.

    This is done by computing the distance of each item's ratio to what
    a perfectly balanced ratio would be. We then sum up the distances,
    dividing by the worst case to normalize between 0 and 1.

    Args:
        col: A column of values. If a DataFrame, the values from the subset of columns
            will be used.

    Returns:
        A tuple of the imbalance ratios, sorted from lowest (0) to highest (1)
        and the expected ratio if perfectly balanced.
    """
    ratios = col.value_counts(dropna=True, normalize=True, ascending=True)
    if len(ratios) == 1:
        return ratios, 1.0

    n_uniq = len(ratios)

    # A balanced ratio is one where all items are equally distributed
    balanced_ratio = float(1 / n_uniq)
    return ratios, balanced_ratio

def column_imbalance(ratios, balanced_ratio) #

Compute the imbalance of a column.

This is done by computing the distance of each item's ratio to what a perfectly balanced ratio would be. We then sum up the distances, dividing by the worst case to normalize between 0 and 1. 0 indicates a perfectly balanced column, 1 indicates a column where all items are of the same type.

PARAMETER DESCRIPTION
ratios

The ratios of each item in the column.

TYPE: Series

balanced_ratio

The ratio of a column if perfectly balanced.

TYPE: float

RETURNS DESCRIPTION
float

The imbalance of the column.

Source code in src/amltk/metalearning/metafeatures.py
def column_imbalance(ratios: pd.Series, balanced_ratio: float) -> float:
    """Compute the imbalance of a column.

    This is done by computing the distance of each item's ratio to what
    a perfectly balanced ratio would be. We then sum up the distances,
    dividing by the worst case to normalize between 0 and 1. 0 indicates
    a perfectly balanced column, 1 indicates the worst case in which one
    item holds all the instances and every other item holds none.

    Args:
        ratios: The ratios of each item in the column.
        balanced_ratio: The ratio of a column if perfectly balanced.

    Returns:
        The imbalance of the column. When every ratio equals the balanced
        ratio, or `ratios` is empty, the result is 0.0.
    """
    total_distance = np.sum(np.abs(ratios - balanced_ratio))

    # Nothing deviates from the balanced ratio, so the column is balanced.
    # Returning here matters for a single item with a balanced ratio of 1.0:
    # the worst case below is then 0 as well and the normalisation would
    # produce `inf * 0 == nan`.
    if total_distance == 0:
        return 0.0

    # The most imbalanced dataset would be one where we somehow have 0
    # items of each type **except** 1 type, which has all the instances.

    # In the case of a symbol group with 0 instance, their distance to the balanced
    # ratio is just the balanced ratio itself.
    zero_instance_ratio_distance = balanced_ratio
    dominant_ratio_distance = np.abs(1 - balanced_ratio)
    n_items = len(ratios)

    worst = (n_items - 1) * zero_instance_ratio_distance + dominant_ratio_distance
    normalizer = 1 / worst

    return float(normalizer * total_distance)

def metafeature_descriptions(features=None) #

Get the descriptions of the available metafeatures.

PARAMETER DESCRIPTION
features

The metafeatures to describe. If None, the descriptions of all subclasses of MetaFeature will be returned.

TYPE: Iterable[type[DatasetStatistic]] | None DEFAULT: None

RETURNS DESCRIPTION
dict[str, str]

The descriptions of the metafeatures.

Source code in src/amltk/metalearning/metafeatures.py
def metafeature_descriptions(
    features: Iterable[type[DatasetStatistic]] | None = None,
) -> dict[str, str]:
    """Collect the descriptions of metafeatures, keyed by their name.

    Args:
        features: The statistics to describe. If None, everything yielded by
            [`MetaFeature.iter()`][amltk.metalearning.MetaFeature] is used.

    Returns:
        A dictionary mapping each `name()` to the matching `description()`,
        in the order the features were given.
    """
    selected = MetaFeature.iter() if features is None else features

    descriptions: dict[str, str] = {}
    for statistic in selected:
        key = statistic.name()
        descriptions[key] = statistic.description()
    return descriptions

def compute_metafeatures(X, y, *, features=None) #

Compute metafeatures for a dataset.

PARAMETER DESCRIPTION
X

The features of the dataset.

TYPE: DataFrame

y

The labels of the dataset.

TYPE: Series | DataFrame

features

The metafeatures to compute. If None, all subclasses of MetaFeature will be computed.

TYPE: Iterable[type[MetaFeature]] | None DEFAULT: None

RETURNS DESCRIPTION
Series

A series of metafeatures.

Source code in src/amltk/metalearning/metafeatures.py
def compute_metafeatures(
    X: pd.DataFrame,  # noqa: N803
    y: pd.Series | pd.DataFrame,
    *,
    features: Iterable[type[MetaFeature]] | None = None,
) -> pd.Series:
    """Compute metafeatures for a dataset.

    Every requested metafeature is computed after its dependencies. Each
    statistic is computed at most once per call and its value is shared with
    everything that depends on it.

    Args:
        X: The features of the dataset.
        y: The labels of the dataset.
        features: The metafeatures to compute. If None, everything yielded by
            [`MetaFeature.iter()`][amltk.metalearning.MetaFeature] is computed.

    Returns:
        A series indexed by metafeature name. Only statistics that are
        subclasses of `MetaFeature` are included; other dataset statistics
        computed along the way are left out.
    """
    requested = MetaFeature.iter() if features is None else features

    # One shared cache for the whole call, keyed by statistic class. The very
    # same dictionary is handed to every `compute()` as its dependency values.
    computed: dict[type[DatasetStatistic], Any] = {}

    def _resolve(statistic: type[DatasetStatistic]) -> None:
        # Dependencies come first so their values are in the cache by the
        # time `statistic` itself is computed.
        for dependency in statistic.dependencies:
            _resolve(dependency)

        if statistic not in computed:
            computed[statistic] = statistic.compute(X, y, computed)

    for metafeature in requested:
        _resolve(metafeature)

    results = {}
    for statistic, value in computed.items():
        if issubclass(statistic, MetaFeature):
            results[statistic.name()] = value

    return pd.Series(results)