Source code for dsp_pandas.df

"""Helpers to identify interesting data in a pandas DataFrame."""

from types import SimpleNamespace

import pandas as pd

__all__ = ["get_unique_and_non_unique_columns"]


def unique_cols(s: pd.Series) -> bool:
    """Check all entries are equal in pandas.Series

    Ref: https://stackoverflow.com/a/54405767/968487

    Every entry is compared to the first entry with ``==``. An empty Series
    has no first entry and is reported as all-equal. For NumPy float data
    NaN does not compare equal to itself, so a non-empty float Series that
    contains NaN is reported as not all-equal.

    Parameters
    ----------
    s : pandas.Series
        Series to check uniqueness

    Returns
    -------
    bool
        Boolean on if all values are equal.
    """
    if s.empty:
        # Nothing to compare against: ``s.iloc[0]`` would raise IndexError.
        return True
    # ``.all()`` yields a NumPy boolean; convert to match the annotation.
    return bool((s.iloc[0] == s).all())


def get_unique_and_non_unique_columns(df: pd.DataFrame) -> SimpleNamespace:
    """Split the column labels of a DataFrame into constant and varying ones.

    Each column is passed to `unique_cols`; the resulting boolean mask is
    used to select from ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to check for unique columns.

    Returns
    -------
    types.SimpleNamespace
        Namespace with the attributes `unique` (labels of columns whose
        entries are all equal) and `non_unique` (labels of the remaining
        columns).
    """
    is_constant = df.apply(unique_cols)
    return SimpleNamespace(
        unique=df.columns[is_constant],
        non_unique=df.columns[~is_constant],
    )
def drop_unique_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out unique columns from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to filter.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` reported as `non_unique` by
        `get_unique_and_non_unique_columns`.
    """
    return df[get_unique_and_non_unique_columns(df).non_unique]


def drop_non_unique_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out non-unique columns from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to filter.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` reported as `unique` by
        `get_unique_and_non_unique_columns`.
    """
    # Keep the `unique` selection: the non-unique columns are the ones dropped.
    return df[get_unique_and_non_unique_columns(df).unique]


def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame:
    """Pass a selection of columns to combine their value counts.

    This performs no checks on the values. Make sure the scale of the
    variables you pass is comparable.

    Parameters
    ----------
    X : pandas.DataFrame
        A DataFrame of several columns with values in a similar range.
    dropna : bool, optional
        Exclude NA values from counting, by default True

    Returns
    -------
    pandas.DataFrame
        DataFrame of combined value counts: one column per column of `X`,
        indexed by the sorted union of the observed values. An empty
        DataFrame is returned if `X` has no columns.
    """
    freq_targets = [
        X[col].value_counts(dropna=dropna).rename(col) for col in X.columns
    ]
    if not freq_targets:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(freq_targets, axis=1, sort=True)