Dtype reduction

amltk.data.dtype_reduction #

Reduce the dtypes of data.

reduce_dtypes #

reduce_dtypes(
    x: D,
    *,
    reduce_int: bool = True,
    reduce_float: bool = True
) -> D

Reduce the dtypes of data.

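When given a dataframe, the dtypes of all columns are reduced. A series or a numpy array is reduced as a whole. Note that only `pd.DataFrame`, `pd.Series` and `np.ndarray` inputs are accepted; any other type (including a generic iterable) raises a `TypeError`.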

Integer data is reduced to the smallest dtype that can hold the minimum and maximum values of the array. Floating point data is reduced by one step, i.e. float32 -> float16, float64 -> float32.

PARAMETER	DESCRIPTION
`x`	The data to reduce. TYPE: `D`
`reduce_int`	Whether to reduce integer dtypes. TYPE: `bool` DEFAULT: `True`
`reduce_float`	Whether to reduce floating point dtypes. TYPE: `bool` DEFAULT: `True`

Source code in src/amltk/data/dtype_reduction.py

def reduce_dtypes(x: D, *, reduce_int: bool = True, reduce_float: bool = True) -> D:
    """Reduce the dtypes of data.

    Accepts a dataframe, a series or a numpy array. Pandas objects are first
    passed through `convert_dtypes()`, after which the integer and floating
    point reductions are applied, each only if enabled.

    Integers are handled by `reduce_int_span` and floats by
    `reduce_floating_precision`, in that order.

    Args:
        x: The data to reduce.
        reduce_int: Whether to reduce integer dtypes.
        reduce_float: Whether to reduce floating point dtypes.

    Returns:
        The reduced data.

    Raises:
        TypeError: If `x` is not a dataframe, a series or a numpy array.
    """
    is_pandas = isinstance(x, (pd.Series, pd.DataFrame))
    if not (is_pandas or isinstance(x, np.ndarray)):
        raise TypeError(f"Cannot reduce data of type {type(x)}.")

    # Only pandas objects offer `convert_dtypes`; numpy arrays go through as-is.
    reduced = x.convert_dtypes() if is_pandas else x

    if reduce_int:
        reduced = reduce_int_span(reduced)

    if reduce_float:
        reduced = reduce_floating_precision(reduced)

    return reduced

reduce_floating_precision #

reduce_floating_precision(x: D) -> D

Reduce the floating point precision of the data.

For a float array, will reduce by one step, i.e. float32 -> float16, float64 -> float32.

PARAMETER	DESCRIPTION
`x`	The data to reduce. TYPE: `D`

RETURNS	DESCRIPTION
`D`	The reduced data.

Source code in src/amltk/data/dtype_reduction.py

def reduce_floating_precision(x: D) -> D:
    """Reduce the floating point precision of the data.

    Float data is narrowed by a single step, e.g. float64 -> float32 or
    float32 -> float16. Data that is not floating point, or whose dtype has no
    narrower counterpart listed below, is returned unchanged.

    A dataframe is handled column by column; its columns are reassigned on the
    dataframe that was passed in, which is then returned.

    Args:
        x: The data to reduce.

    Returns:
        The reduced data.
    """
    if isinstance(x, pd.DataFrame):
        # Columns are reassigned one at a time, `apply` doesn't work here.
        for name in x.columns:
            x[name] = reduce_floating_precision(x[name])
        return x  # type: ignore

    if x.dtype.kind == "f":
        narrower = {
            # Plain numpy dtypes
            "float128": "float64",
            "float96": "float64",
            "float64": "float32",
            "float32": "float16",
            # Pandas nullable dtypes (only 64 and 32 bit exist)
            "Float64": "Float32",
        }.get(x.dtype.name)

        if narrower is not None:
            return x.astype(narrower)  # type: ignore

    return x

reduce_int_span #

reduce_int_span(x: D) -> D

Reduce the integer span of the data.

For an int array, will reduce to the smallest dtype that can hold the minimum and maximum values of the array.

PARAMETER	DESCRIPTION
`x`	The data to reduce. TYPE: `D`

RETURNS	DESCRIPTION
`D`	The reduced data.

Source code in src/amltk/data/dtype_reduction.py

def reduce_int_span(x: D) -> D:
    """Reduce the integer span of the data.

    For an int array, will reduce to the smallest integer dtype that can hold
    the minimum and maximum values of the array. Data without negative values
    is reduced to an unsigned dtype, all other data to a signed dtype.

    A dataframe is handled column by column; its columns are reassigned on the
    dataframe that was passed in, which is then returned. Non-integer data and
    empty numpy arrays are returned unchanged.

    Args:
        x: The data to reduce.

    Returns:
        The reduced data.
    """
    # For a dataframe, we recurse over all columns
    if isinstance(x, pd.DataFrame):
        # Using `apply` doesn't work
        for col in x.columns:
            x[col] = reduce_int_span(x[col])
        return x  # type: ignore

    if x.dtype.kind not in "iu":
        return x

    # A plain numpy dtype may not allow for nullable values, which are
    # permissible in pandas, so for a series the downcast is left to
    # `to_numeric`. We only decide whether an unsigned dtype is possible.
    if isinstance(x, pd.Series):
        lowest = x.min()
        # An empty or all-missing series has no usable minimum; fall back to
        # the signed downcast in that case.
        non_negative = not pd.isna(lowest) and lowest >= 0
        downcast = "unsigned" if non_negative else "integer"
        return pd.to_numeric(x, downcast=downcast)  # type: ignore

    # `min`/`max` are undefined for an empty array, there is nothing to reduce.
    if x.size == 0:
        return x

    lowest, highest = int(x.min()), int(x.max())

    # Pick the dtype by checking the bounds directly. Promoting the minimal
    # dtypes of the min and the max with `np.result_type` is not safe, as
    # mixing a signed with an unsigned dtype can promote to a wider dtype than
    # needed or even to float64 (e.g. int8 with uint64).
    if lowest >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)

    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lowest and highest <= bounds.max:
            return x.astype(candidate)  # type: ignore

    # No standard integer dtype covers the span, keep the data as it is.
    return x