Source code for remodeler.operations.summarize_sidecar_from_events_op

"""Create a JSON sidecar from column values in a collection of tabular files."""

import json
from hed.tools.analysis.tabular_summary import TabularSummary
from remodeler.operations.base_op import BaseOp
from remodeler.operations.base_summary import BaseSummary



[docs]
class SummarizeSidecarFromEventsOp(BaseOp):
    """Build a JSON sidecar template from the column values of tabular files.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    Optional remodeling parameters:
        - **append_timecode** (*bool*): Stored on the operation (defaults to False). It is not
          used in this class; presumably it controls whether a timecode is appended to the
          summary filename when the summary is saved -- TODO confirm against the saving code.
        - **skip_columns** (*list*): Names of columns to skip in the summary.
        - **value_columns** (*list*): Names of columns to treat as value columns rather than
          categorical columns.

    The resulting sidecar template is intended as a starting point for annotating a dataset
    with HED tags.

    """

    # Identifier of this operation.
    NAME = "summarize_sidecar_from_events"

    # JSON schema describing the parameters accepted by this operation.
    PARAMS = {
        "type": "object",
        "properties": {
            "summary_name": {
                "type": "string",
                "description": "Name to use for the summary in titles.",
            },
            "summary_filename": {
                "type": "string",
                "description": "Name to use for the summary file name base.",
            },
            "skip_columns": {
                "type": "array",
                "description": "List of columns to skip in generating the sidecar.",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True,
            },
            "value_columns": {
                "type": "array",
                "description": "List of columns to provide a single annotation with placeholder for the values.",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True,
            },
            "append_timecode": {"type": "boolean"},
        },
        "required": ["summary_name", "summary_filename"],
        "additionalProperties": False,
    }

    # Type label for the summaries produced by this operation.
    SUMMARY_TYPE = "events_to_sidecar"

    def __init__(self, parameters):
        """Set up the operation from its parameter dictionary.

        Parameters:
            parameters (dict): Values for the required and optional parameters listed in PARAMS.

        """
        super().__init__(parameters)

        # Required entries: a missing key raises KeyError here.
        self.summary_name = parameters["summary_name"]
        self.summary_filename = parameters["summary_filename"]

        # Optional entries fall back to None (column lists) or False (timecode flag).
        self.append_timecode = parameters.get("append_timecode", False)
        self.value_columns = parameters.get("value_columns")
        self.skip_columns = parameters.get("skip_columns")

    def do_op(self, dispatcher, df, name, sidecar=None):
        """Record sidecar information for one tabular file and return a copy of it.

        Parameters:
            dispatcher (Dispatcher): The dispatcher object for managing the operations.
            df (DataFrame): The tabular file to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not used by this operation.

        Returns:
            DataFrame: A copy of df.

        Side effect:
            Creates the summary registered under ``summary_name`` on the dispatcher if there
            is none yet, then updates it with this file.

        """
        copied_df = df.copy()

        # Reuse the summary already registered under this name, or register a new one.
        events_summary = dispatcher.summary_dicts.get(self.summary_name)
        if not events_summary:
            events_summary = EventsToSidecarSummary(self)
            dispatcher.summary_dicts[self.summary_name] = events_summary

        update_info = {"df": dispatcher.post_proc_data(copied_df), "name": name}
        events_summary.update_summary(update_info)
        return copied_df

    @staticmethod
    def validate_input_data(parameters):
        """Perform parameter checks beyond the JSON schema; this operation has none.

        Parameters:
            parameters (dict): The operation parameters (not inspected).

        Returns:
            list: Always an empty list.

        """
        return []




class EventsToSidecarSummary(BaseSummary):
    """Manager for events to sidecar generation.

    Keeps one TabularSummary per tabular file and can merge them into a dataset-level
    summary from which a sidecar template is extracted.

    """

    def __init__(self, sum_op):
        """Constructor for events to sidecar manager.

        Parameters:
            sum_op (BaseOp): Operation associated with this summary; its ``value_columns``
                and ``skip_columns`` are used for every per-file summary.

        """
        super().__init__(sum_op)
        self.value_cols = sum_op.value_columns
        self.skip_cols = sum_op.skip_columns

    def update_summary(self, new_info):
        """Update the summary for a given tabular input file.

        Parameters:
            new_info (dict):  A dictionary with the parameters needed to update a summary.

        Notes:
            - The summary needs a "name" str and a "df".
            - A fresh TabularSummary is built for the file and stored under its name,
              replacing any earlier entry with the same name.

        """
        file_name = new_info["name"]
        tab_sum = TabularSummary(value_cols=self.value_cols, skip_cols=self.skip_cols, name=file_name)
        tab_sum.update(new_info["df"], file_name)
        # NOTE(review): summary_dict is not created here; presumably BaseSummary.__init__ sets it up.
        self.summary_dict[file_name] = tab_sum

    def get_details_dict(self, summary_info):
        """Return the summary-specific information.

        Parameters:
            summary_info (TabularSummary):  Summary to return info from.

        Returns:
            dict: Standardized details dictionary extracted from the summary information.

        Notes:
            The "Specifics" entry holds the categorical info, value info, skip columns and
            the extracted sidecar template.

        """
        return {
            "Name": summary_info.name,
            "Total events": summary_info.total_events,
            "Total files": summary_info.total_files,
            "Files": list(summary_info.files.keys()),
            "Specifics": {
                "Categorical info": summary_info.categorical_info,
                "Value info": summary_info.value_info,
                "Skip columns": summary_info.skip_cols,
                "Sidecar": summary_info.extract_sidecar_template(),
            },
        }

    def merge_all_info(self):
        """Merge summary information from all the files.

        Returns:
           TabularSummary:  Consolidated summary of information, named "Dataset".

        """
        all_sum = TabularSummary(name="Dataset")
        # Only the per-file summaries are needed; the file names (keys) are not used.
        for tab_sum in self.summary_dict.values():
            all_sum.update_summary(tab_sum)
        return all_sum

    def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT):
        """Return a formatted string with the summary for the indicated name.

        Parameters:
            name (str):  Identifier (usually the filename) of the individual file.
            result (dict): The dictionary of the summary results indexed by name.
            indent (str): A string containing spaces used for indentation (usually 3 spaces).

        Returns:
            str: The results in a printable format ready to be saved to a text file.

        Notes:
            The name "Dataset" selects the overall summary (_get_dataset_string); any other
            name selects an individual file summary (_get_individual_string).

        """
        if name == "Dataset":
            return self._get_dataset_string(result, indent=indent)
        return self._get_individual_string(result, indent=indent)

    @staticmethod
    def _get_specifics_lines(specifics, indent):
        """Return the display lines shared by the dataset and individual summaries.

        Parameters:
            specifics (dict): The "Specifics" portion of a details dictionary.
            indent (str):  String of blanks passed to json.dumps as the indentation.

        Returns:
            list: Lines for the skip columns, the value columns, and the sidecar template.

        Notes:
            Missing entries fall back to empty defaults rather than raising.

        """
        # list(...) shows the column names themselves rather than a "dict_keys([...])" repr.
        value_columns = list(specifics.get("Value info", {}))
        return [
            f"Skip columns: {specifics.get('Skip columns', [])}",
            f"Value columns: {value_columns}",
            f"Sidecar:\n{json.dumps(specifics.get('Sidecar', {}), indent=indent)}",
        ]

    @staticmethod
    def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT):
        """Return a string with the overall summary for all the tabular files.

        Parameters:
            result (dict): Dictionary of merged summary information.
            indent (str):  String of blanks used as the amount to indent for readability.

        Returns:
            str: Formatted string suitable for saving in a file or printing.

        """
        header = f"Dataset: Total events={result.get('Total events', 0)} Total files={result.get('Total files', 0)}"
        specifics = result.get("Specifics", {})
        return "\n".join([header, *EventsToSidecarSummary._get_specifics_lines(specifics, indent)])

    @staticmethod
    def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT):
        """Return a string with the summary for an individual tabular file.

        Parameters:
            result (dict): Dictionary of summary information for a particular tabular file.
            indent (str):  String of blanks used as the amount to indent for readability.

        Returns:
            str: Formatted string suitable for saving in a file or printing.

        """
        header = f"Total events={result.get('Total events', 0)}"
        specifics = result.get("Specifics", {})
        # The shared helper reads 'Skip columns'; the earlier misspelled 'Slip columns'
        # lookup always reported an empty list.
        return "\n".join([header, *EventsToSidecarSummary._get_specifics_lines(specifics, indent)])

    @staticmethod
    def validate_input_data(parameters):
        """Return an empty list; no additional validation is performed here.

        Parameters:
            parameters (dict): The operation parameters (not inspected).

        Returns:
            list: Always an empty list.

        """
        return []