# Source code for remodeler.operations.summarize_column_values_op

"""Summarize the values in the columns of a columnar file."""

import pandas as pd
from hed.tools.analysis.tabular_summary import TabularSummary
from remodeler.operations.base_op import BaseOp
from remodeler.operations.base_summary import BaseSummary



# [docs]
class SummarizeColumnValuesOp(BaseOp):
    """Summarize the values in the columns of a columnar file.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    Optional remodeling parameters:
        - **append_timecode** (*bool*): (**Optional**: Default False) If True append timecodes to the summary filename.
        - **max_categorical** (*int*): Maximum number of unique values to include in summary for a categorical column.
          When omitted, the constructor applies no limit.
        - **skip_columns** (*list*):  Names of columns to skip in the summary.
          When omitted, the constructor uses an empty list.
        - **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
          When omitted, the constructor uses an empty list.
        - **values_per_line** (*int*): The number of values output per line in the summary.
          When omitted, the constructor uses VALUES_PER_LINE.

    The purpose is to produce a summary of the values in a tabular file.

    """

    # Identifier of this operation (presumably the name used to select it in a remodeling specification).
    NAME = "summarize_column_values"

    # JSON-schema-style description of the parameters this operation accepts.
    # Only summary_name and summary_filename are required; unknown keys are rejected.
    PARAMS = {
        "type": "object",
        "properties": {
            "summary_name": {"type": "string", "description": "Name to use for the summary in titles."},
            "summary_filename": {"type": "string", "description": "Name to use for the summary file name base."},
            "append_timecode": {
                "type": "boolean",
                "description": "If true, the timecode is appended to the base filename so each run has a unique name.",
            },
            "max_categorical": {
                "type": "integer",
                "description": "Maximum number of unique column values to show in text description.",
            },
            "skip_columns": {
                "type": "array",
                "description": "List of columns to skip when creating the summary.",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True,
            },
            "value_columns": {
                "type": "array",
                "description": "Columns to be annotated with a single HED annotation and placeholder.",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True,
            },
            "values_per_line": {
                "type": "integer",
                "description": "Number of items per line to display in the text file.",
            },
        },
        "required": ["summary_name", "summary_filename"],
        "additionalProperties": False,
    }

    # Label identifying the kind of summary this operation produces.
    SUMMARY_TYPE = "column_values"
    # Default number of values written per output line when values_per_line is not given.
    VALUES_PER_LINE = 5
    # NOTE(review): not referenced in the visible code -- the constructor defaults
    # max_categorical to float("inf") instead; confirm which default is intended.
    MAX_CATEGORICAL = 50


    # [docs]
    def __init__(self, parameters):
        """Set up the operation from its remodeling parameters.

        Parameters:
            parameters (dict): Values for the required and optional parameters of this operation.

        """
        super().__init__(parameters)

        # Required entries are indexed directly, so a missing key raises KeyError.
        self.summary_name = parameters["summary_name"]
        self.summary_filename = parameters["summary_filename"]

        # Optional entries: the attribute name matches the parameter key, and the
        # value falls back to the listed default when the key is absent.
        optional_defaults = {
            "append_timecode": False,
            "max_categorical": float("inf"),
            "skip_columns": [],
            "value_columns": [],
            "values_per_line": self.VALUES_PER_LINE,
        }
        for key, fallback in optional_defaults.items():
            setattr(self, key, parameters.get(key, fallback))



    # [docs]
    def do_op(self, dispatcher, df, name, sidecar=None) -> pd.DataFrame:
        """Add the column values of df to this operation's summary.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be summarized.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not used by this operation.

        Returns:
            DataFrame: A copy of df.

        Side effect:
            Creates the summary registered under summary_name in the dispatcher if it does not exist yet,
            then updates it with the post-processed copy of df.

        """
        copied = df.copy()

        # Reuse the summary already registered under this name, or register a new one.
        tracker = dispatcher.summary_dicts.get(self.summary_name, None)
        if not tracker:
            tracker = ColumnValueSummary(self)
            dispatcher.summary_dicts[self.summary_name] = tracker

        update_info = {"df": dispatcher.post_proc_data(copied), "name": name}
        tracker.update_summary(update_info)
        return copied



    # [docs]
    @staticmethod
    def validate_input_data(parameters) -> list:
        """Additional validation required of operation parameters not performed by JSON schema validator.

        Parameters:
            parameters (dict): The parameter values for this operation (not examined here).

        Returns:
            list: Always an empty list -- this operation performs no additional checks
                (presumably the list would otherwise hold error messages).

        """
        return []




class ColumnValueSummary(BaseSummary):
    """Manager for summaries of column contents for columnar files.

    The methods of this class read ``self.op`` and ``self.summary_dict``; neither is assigned
    in this class, so they are presumably set up by the BaseSummary constructor -- TODO confirm.

    """

    def __init__(self, sum_op):
        """Constructor for column value summary manager.

        Parameters:
            sum_op (SummarizeColumnValuesOp): Operation associated with this summary.

        """
        # All initialization is delegated to the base class.
        super().__init__(sum_op)

    def update_summary(self, new_info):
        """Fold the contents of one tabular file into its per-file summary.

        Parameters:
            new_info (dict): Must contain a "name" entry (key for the file) and a "df" entry (the data).

        Notes:
            - A separate TabularSummary is kept for each distinct name.
            - The TabularSummary is created on first use with the operation's value and skip columns.

        """
        file_key = new_info["name"]
        if file_key not in self.summary_dict:
            operation = self.op
            self.summary_dict[file_key] = TabularSummary(
                value_cols=operation.value_columns,
                skip_cols=operation.skip_columns,
                name=file_key,
            )
        file_summary = self.summary_dict[file_key]
        file_summary.update(new_info["df"])

    def get_details_dict(self, tabular_summary) -> dict:
        """Convert the contents of a TabularSummary into a dictionary used for output.

        Parameters:
            tabular_summary (TabularSummary): The tabular summary object.

        Returns:
            dict: Dictionary with overall totals plus a "Specifics" entry holding the column details.

        Notes:
            - The number of unique values per categorical column is recorded before truncation.
            - Each categorical column is reordered by descending first count and cut to max_categorical entries.
            - The dictionary returned by get_summary is modified in place.

        """
        info = tabular_summary.get_summary(as_json=False)
        categorical = info["Categorical columns"]

        # Unique-value totals must be taken before the columns are truncated below.
        info["Categorical counts"] = {column: len(counts) for column, counts in categorical.items()}

        # Only existing keys are reassigned, so the dictionary size is unchanged during iteration.
        for column, counts in categorical.items():
            total, ranked = ColumnValueSummary.sort_dict(counts, reverse=True)
            keep = min(total, self.op.max_categorical)
            categorical[column] = dict(ranked[:keep])

        specifics = {
            "Value columns": list(info["Value columns"]),
            "Skip columns": info["Skip columns"],
            "Value column summaries": info["Value columns"],
            "Categorical column summaries": info["Categorical columns"],
            "Categorical counts": info["Categorical counts"],
        }
        return {
            "Name": info["Name"],
            "Total events": info["Total events"],
            "Total files": info["Total files"],
            "Files": list(info["Files"].keys()),
            "Specifics": specifics,
        }

    def merge_all_info(self) -> "TabularSummary":
        """Combine every per-file summary into a single summary named "Dataset".

        Returns:
            TabularSummary: The merged summary object for column values.

        """
        operation = self.op
        dataset_summary = TabularSummary(
            value_cols=operation.value_columns,
            skip_cols=operation.skip_columns,
            name="Dataset",
        )
        for file_summary in self.summary_dict.values():
            dataset_summary.update_summary(file_summary)
        return dataset_summary

    def _get_result_string(self, name, summary, individual=False) -> str:
        """Return a formatted string with the summary for the indicated name.

        Parameters:
            name (str):  Identifier (usually the filename) of the individual file.
            summary (dict): The dictionary of the summary results indexed by name.
            individual (bool): Whether this is for an individual file summary.

        Returns:
            str: The results in a printable format ready to be saved to a text file.

        Notes:
            This calls _get_dataset_string to get the overall summary string and
            _get_individual_string to get an individual summary string.

        """

        if name == "Dataset":
            sum_list = [
                f"Dataset: Total events={summary.get('Total events', 0)} Total files={summary.get('Total files', 0)}"
            ]
        else:
            sum_list = [f"Total events={summary.get('Total events', 0)}"]
        sum_list = sum_list + self._get_detail_list(summary, indent=BaseSummary.DISPLAY_INDENT)
        return "\n".join(sum_list)

    def _get_individual_string(self, result, indent=BaseSummary.DISPLAY_INDENT) -> str:
        """Return a formatted string with the summary for an individual file.

        Parameters:
            result (dict): The dictionary of the summary results indexed by name.
            indent (str): A string containing spaces used for indentation (usually 3 spaces).

        Returns:
            str: The results in a printable format ready to be saved to a text file.

        Notes:
            This calls _get_categorical_string to get the categorical part of the summary,
            and _get_value_string to get the value column part of the summary.

        """

        return "\n".join(
            [
                f"Summary for {result['Name']}",
                f"  Total events: {result.get('Total events', 0)}",
                f"  Total files: {result.get('Total files', 0)}",
                f"  Value columns: {result['Specifics']['Value columns']}",
                f"  Skip columns: {result['Specifics']['Skip columns']}",
                self._get_categorical_string(result),
                self._get_value_string(result["Specifics"]["Value column summaries"]),
            ]
        )

    def _format_categorical_lists(self, specifics) -> list:
        """Format the categorical column summaries for display.

        Parameters:
            specifics (dict): The specifics dictionary from the summary.

        Returns:
            list: A list of formatted strings for the categorical column summaries.

        """
        cat_dict = specifics.get("Categorical column summaries", {})
        if not cat_dict:
            return []
        count_dict = specifics["Categorical counts"]
        formatted_list = ["Categorical column values[Events, Files]:"]
        sorted_tuples = sorted(cat_dict.items(), key=lambda x: x[0])
        for entry in sorted_tuples:
            formatted_list = formatted_list + self._get_categorical_col(entry, count_dict, offset="", indent="   ")
        return formatted_list

    def _get_categorical_string(self, summary, offset="", indent="   "):
        """Return  a string with the summary for a particular categorical dictionary.

        Parameters:
            summary (dict): Dictionary of summary information for a particular tabular file.
            offset (str): String of blanks used as offset for every item
            indent (str):  String of blanks used as the additional amount to indent an item's for readability.

        Returns:
            str: Formatted string suitable for saving in a file or printing.

        """
        specifics = summary.get("Specifics", {})
        cat_dict = specifics.get("Categorical column summaries", {})
        if not cat_dict:
            return ""
        count_dict = specifics.get("Categorical counts", {})
        sum_list = [f"{offset}{indent}Categorical column values[Events, Files]:"]
        sorted_tuples = sorted(cat_dict.items(), key=lambda x: x[0])
        for entry in sorted_tuples:
            sum_list = sum_list + self._get_categorical_col(entry, count_dict, offset="", indent="   ")
        return "\n".join(sum_list)

    def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT):
        """Return  a list of strings with the details

        Parameters:
            result (dict): Dictionary of merged summary information.
            indent (str):  String of blanks used as the amount to indent for readability.

        Returns:
            list: list of formatted strings suitable for saving in a file or printing.

        """
        sum_list = []
        specifics = result["Specifics"]
        cat_string = self._get_categorical_string(specifics, offset="", indent=indent)
        if cat_string:
            sum_list.append(cat_string)
        val_dict = specifics.get("Value column summaries", {})
        if val_dict:
            sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent))
        return sum_list

    def _get_categorical_col(self, entry, count_dict, offset="", indent="   "):
        """Return  a string with the summary for a particular categorical column.

        Parameters:
            entry(tuple): (Name of the column, summary dict for that column)
            count_dict (dict): Count of the total number of unique values indexed by the name
            offset(str): String of blanks used as offset for all items
            indent (str):  String of blanks used as the additional amount to indent for this item's readability.

        Returns:
            list: Formatted strings, each corresponding to a line in the output.
        """
        num_unique = count_dict[entry[0]]
        num_disp = min(self.op.max_categorical, num_unique)
        col_list = [f"{offset}{indent * 2}{entry[0]}: {num_unique} unique values (displaying top {num_disp} values)"]
        # Create and partition the list of individual entries
        value_list = [f"{item[0]}{str(item[1])}" for item in entry[1].items()]
        value_list = value_list[:num_disp]
        part_list = ColumnValueSummary.partition_list(value_list, self.op.values_per_line)
        return col_list + [f"{offset}{indent * 3}{ColumnValueSummary.get_list_str(item)}" for item in part_list]

    @staticmethod
    def get_list_str(lst) -> str:
        """Return a str version of a list with items separated by a blank.

        Returns:
            str:  String version of list.

        """
        return f"{' '.join(str(item) for item in lst)}"

    @staticmethod
    def partition_list(lst, n) -> list:
        """Partition a list into lists of n items.

        Parameters:
            lst (list): List to be partitioned.
            n (int):  Number of items in each sublist.

        Returns:
            list:  list of lists of n elements, the last might have fewer.

        """
        return [lst[i : i + n] for i in range(0, len(lst), n)]

    @staticmethod
    def _get_value_string(val_dict, offset="", indent="") -> str:
        sum_list = [f"{offset}{indent}Value columns[Events, Files]:"]
        for col_name, val_counts in val_dict.items():
            sum_list.append(f"{offset}{indent * 2}{col_name}{str(val_counts)}")
        return "\n".join(sum_list)

    @staticmethod
    def sort_dict(count_dict, reverse=False):
        sorted_tuples = sorted(count_dict.items(), key=lambda x: x[1][0], reverse=reverse)
        return len(sorted_tuples), sorted_tuples