Source code for datpro.datpro

import pandas as pd
import numpy as np
import altair as alt
from itertools import combinations
from typing import Dict, Union, Optional, List


[docs]
def summarize_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarizes numeric columns in a given DataFrame by calculating key statistical metrics.

    This function automatically detects numeric columns in the provided DataFrame and 
    returns a summary DataFrame containing the minimum, 25th percentile (Q1), median (50th percentile),
    75th percentile (Q3), and maximum values for each numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing data to be summarized.

    Returns
    -------
    pandas.DataFrame
        A DataFrame where each row corresponds to a numeric column in the input DataFrame,
        and the columns represent the calculated statistics: min, 25%, 50% (median), 75%, and max.

    Example
    -------
    >>> import pandas as pd
    >>> import numpy as np
    >>> data = {
    ...     "A": [1, 2, np.nan, 4],
    ...     "B": [100, 200, 300, 400],
    ...     "C": [1, 1, 1, 100]
    ... }
    >>> df = pd.DataFrame(data)
    >>> summarize_data(df)
         min   25%   50%   75%    max
    A    1.0   1.5   2.0   3.0    4.0
    B  100.0  175.0  250.0  325.0  400.0
    C    1.0   1.0   1.0   50.5  100.0
    """

	# Check if input is a DataFrame
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    # Check if DataFrame is empty
    if df.empty:
        raise ValueError("The input DataFrame is empty.")

    # Select numeric columns
    numeric_cols = df.select_dtypes(include=['number'])

    # Check if there are numeric columns
    if numeric_cols.empty:
        raise ValueError("The DataFrame contains no numeric columns.")

    # Calculate summary statistics
    summary = numeric_cols.describe(percentiles=[0.25, 0.5, 0.75]).T

    # Select relevant statistics
    summary = summary[['min', '25%', '50%', '75%', 'max']]

    return summary



[docs]
def detect_anomalies(df: pd.DataFrame, anomaly_type: Optional[str] = None) -> Dict[str, Union[Dict[str, Dict[str, Union[int, float]]], str]]:
    """
    Detect anomalies in a dataframe, including missing values, outliers, and duplicates.
    
    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe to analyze.
    anomaly_type : str, optional
        Specify which anomaly to check ('missing_values', 'outliers', or 'duplicates').
        If None, all anomaly types will be checked.
    
    Returns
    -------
    dict
        A dictionary containing detected anomalies based on the specified anomaly_type.
    
    Example
    -------
    >>> import pandas as pd
    >>> data = {'A': [1, 2, np.nan, 4], 'B': [100, 200, 300, 400], 'C': [1, 1, 1, 100]}
    >>> df = pd.DataFrame(data)
    >>> detect_anomalies(df, anomaly_type='missing_values')
    {'missing_values': {'A': {'missing_count': 1, 'missing_percentage': 25.0}}}
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")
    
    report = {}
    total_rows = len(df)
    
    if anomaly_type is None or anomaly_type == 'missing_values':
        missing_values = df.isnull().sum()
        missing_info = {
            col: {
                "missing_count": int(missing_values[col]),
                "missing_percentage": round((missing_values[col] / total_rows) * 100, 2)
            }
            for col in df.columns if missing_values[col] > 0
        }
        report['missing_values'] = missing_info if missing_info else "No missing values detected."
    
    if anomaly_type is None or anomaly_type == 'outliers':
        outlier_info = {}
        for col in df.select_dtypes(include=[np.number]).columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
            if not outliers.empty:
                outlier_info[col] = {
                    "outlier_count": len(outliers),
                    "outlier_percentage": round((len(outliers) / total_rows) * 100, 2)
                }
        report['outliers'] = outlier_info if outlier_info else "No outliers detected."
    
    if anomaly_type is None or anomaly_type == 'duplicates':
        duplicate_count = df.duplicated().sum()
        report['duplicates'] = {
            "duplicate_count": duplicate_count,
            "duplicate_percentage": round((duplicate_count / total_rows) * 100, 2)
        } if duplicate_count > 0 else "No duplicate rows detected."
    
    return report



[docs]
def plotify(df: pd.DataFrame, plot_types: Optional[List[str]] = None, save: bool = False, save_path: str = "plots", file_prefix: str = "plot") -> Dict[str, alt.Chart]:
    """
    Visualize a DataFrame by generating specified plots based on column datatypes.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the data to be visualized.
    
    plot_types : list of str, optional
        A list of plot types to generate. Available options include:
        - 'histogram' : Plot a histogram for numeric columns.
        - 'density' : Plot a density plot for numeric columns.
        - 'bar' : Plot a bar chart for categorical columns.
        - 'scatter' : Plot scatter plots for pairwise numeric columns.
        - 'correlation' : Plot a correlation heatmap for numeric columns.
        - 'box' : Plot box plots for numeric vs categorical columns.
        - 'stacked_bar' : Plot stacked bar charts for pairwise categorical columns.
        If None, all plot types are generated by default.
    
    save : bool, optional
        If True, saves the plots to the specified path. Default is False.
    
    save_path : str, optional
        The directory where plots should be saved. Default is 'plots'.
    
    file_prefix : str, optional
        The prefix for saved plot filenames. Default is 'plot'.
    
    Returns
    -------
    dict
        A dictionary where keys are plot names and values are Altair Chart objects.
    
    Raises
    ------
    TypeError
        If the input is not a pandas DataFrame.
    ValueError
        If the input DataFrame is empty.

    Notes
    -----
    - Numeric columns are those of types 'int64', 'float64'.
    - Categorical columns are those of types 'object', 'category', and 'bool'.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['x', 'y', 'x', 'y']})
    >>> charts = plotify(df, plot_types=['histogram', 'bar'])
    >>> charts['histogram_A'].show()
    """
    import os
    
    # Validate input
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    
    if save and not os.path.exists(save_path):
        os.makedirs(save_path)

    # Set default plot types if not specified
    if plot_types is None:
        plot_types = ['histogram', 'density', 'bar', 'scatter', 'correlation', 'box', 'stacked_bar']
    
    # Analyze columns
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    
    plots = {}

    # Individual column visualizations
    if 'histogram' in plot_types or 'density' in plot_types:
        for col in numeric_cols:
            if 'histogram' in plot_types:
                hist_chart = alt.Chart(df).mark_bar().encode(
                    x=alt.X(col, bin=True, title=f"{col} (binned)"),
                    y=alt.Y('count()', title='Count')
                ).properties(title=f"Histogram of {col}")
                plots[f'histogram_{col}'] = hist_chart
                if save:
                    hist_chart.save(f"{save_path}/{file_prefix}_histogram_{col}.html")
            if 'density' in plot_types:
                density_chart = alt.Chart(df).transform_density(
                    col, as_=[col, 'density']
                ).mark_area(opacity=0.5).encode(
                    x=alt.X(col, title=col),
                    y=alt.Y('density:Q', title='Density')
                ).properties(title=f"Density Plot of {col}")
                plots[f'density_{col}'] = density_chart
                if save:
                    density_chart.save(f"{save_path}/{file_prefix}_density_{col}.html")
    
    if 'bar' in plot_types:
        for col in categorical_cols:
            bar_chart = alt.Chart(df).mark_bar().encode(
                x=alt.X(col, title=col),
                y=alt.Y('count()', title='Count')
            ).properties(title=f"Bar Chart of {col}")
            plots[f'bar_{col}'] = bar_chart
            if save:
                bar_chart.save(f"{save_path}/{file_prefix}_bar_{col}.html")
    
    if 'scatter' in plot_types:
        for col1, col2 in combinations(numeric_cols, 2):
            scatter_chart = alt.Chart(df).mark_circle(size=60).encode(
                x=alt.X(col1, title=col1),
                y=alt.Y(col2, title=col2),
                tooltip=[col1, col2]
            ).properties(title=f"Scatter Plot: {col1} vs {col2}")
            plots[f'scatter_{col1}_{col2}'] = scatter_chart
            if save:
                scatter_chart.save(f"{save_path}/{file_prefix}_scatter_{col1}_{col2}.html")
    
    if 'correlation' in plot_types and len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr().stack().reset_index()
        corr_matrix.columns = ['Variable 1', 'Variable 2', 'Correlation']
        heatmap = alt.Chart(corr_matrix).mark_rect().encode(
            x=alt.X('Variable 1:N'),
            y=alt.Y('Variable 2:N'),
            color=alt.Color('Correlation:Q', scale=alt.Scale(scheme='viridis'))
        ).properties(title='Correlation Heatmap')
        plots['correlation_heatmap'] = heatmap
        if save:
            heatmap.save(f"{save_path}/{file_prefix}_correlation_heatmap.html")
    
    if 'box' in plot_types:
        for numeric_col in numeric_cols:
            for categorical_col in categorical_cols:
                box_plot = alt.Chart(df).mark_boxplot().encode(
                    x=alt.X(categorical_col, title=categorical_col),
                    y=alt.Y(numeric_col, title=numeric_col)
                ).properties(title=f"Box Plot of {numeric_col} by {categorical_col}")
                plots[f'box_{numeric_col}_{categorical_col}'] = box_plot
                if save:
                    box_plot.save(f"{save_path}/{file_prefix}_box_{numeric_col}_{categorical_col}.html")
    
    if 'stacked_bar' in plot_types:
        for col1, col2 in combinations(categorical_cols, 2):
            stacked_bar_chart = alt.Chart(df).mark_bar().encode(
                x=alt.X(col1, title=col1),
                y=alt.Y('count()', title='Count'),
                color=alt.Color(col2, title=col2)
            ).properties(title=f"Stacked Bar Chart of {col1} vs {col2}")
            plots[f'stacked_bar_{col1}_{col2}'] = stacked_bar_chart
            if save:
                stacked_bar_chart.save(f"{save_path}/{file_prefix}_stacked_bar_{col1}_{col2}.html")
    
    return plots