Source code for remodeler.operations.summarize_column_names_op

"""Summarize the column names in a collection of tabular files."""

import pandas as pd
from hed.tools.analysis.column_name_summary import ColumnNameSummary
from remodeler.operations.base_op import BaseOp
from remodeler.operations.base_summary import BaseSummary


class SummarizeColumnNamesOp(BaseOp):
    """Record the column names of each tabular file passed through the operation.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    Optional remodeling parameters:
        - **append_timecode** (*bool*): If False (default), the timecode is not appended to the summary filename.

    The purpose is to check that all the tabular files have the same columns in same order.

    """

    NAME = "summarize_column_names"

    # JSON schema describing the parameters accepted by this operation.
    PARAMS = {
        "type": "object",
        "properties": {
            "summary_name": {
                "type": "string",
                "description": "Name to use for the summary in titles.",
            },
            "summary_filename": {
                "type": "string",
                "description": "Name to use for the summary file name base.",
            },
            "append_timecode": {
                "type": "boolean",
                "description": "If true, the timecode is appended to the base filename so each run has a unique name.",
            },
        },
        "required": ["summary_name", "summary_filename"],
        "additionalProperties": False,
    }

    SUMMARY_TYPE = "column_names"

    def __init__(self, parameters):
        """Set up the operation from its parameter dictionary.

        Parameters:
            parameters (dict): Values for the required parameters ("summary_name", "summary_filename")
                and, optionally, "append_timecode".

        """
        super().__init__(parameters)
        self.summary_name = parameters["summary_name"]
        self.summary_filename = parameters["summary_filename"]
        # Optional parameter: falls back to False when it is not supplied.
        self.append_timecode = parameters.get("append_timecode", False)

    def do_op(self, dispatcher, df, name, sidecar=None) -> pd.DataFrame:
        """Add the column names of df to this operation's summary.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O; its summary_dicts holds the summaries by name.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not used by this operation.

        Returns:
            DataFrame: A copy of df.

        Side effect:
            Creates the summary in dispatcher.summary_dicts if needed and updates it with the columns of df.

        """
        result = df.copy()
        tracker = dispatcher.summary_dicts.get(self.summary_name)
        if not tracker:
            # Nothing usable registered under this summary name yet: create and register the manager.
            tracker = ColumnNamesSummary(self)
            dispatcher.summary_dicts[self.summary_name] = tracker
        file_info = {"name": name, "column_names": list(result.columns)}
        tracker.update_summary(file_info)
        return result

    @staticmethod
    def validate_input_data(parameters) -> list:
        """Perform validation beyond what the JSON schema covers; this operation has none, so the list is empty."""
        return []


class ColumnNamesSummary(BaseSummary):
    """Manager for summaries of column names for a dataset."""

    def __init__(self, sum_op):
        """Create the column name summary manager.

        Parameters:
            sum_op (SummarizeColumnNamesOp): Operation associated with this summary.

        """
        super().__init__(sum_op)

    def update_summary(self, new_info):
        """Fold the column names of one tabular file into the summary.

        Parameters:
            new_info (dict): Must provide a "name" str and a "column_names" list.

        Notes:
            - Each file name gets its own ColumnNameSummary object, stored in summary_dict under that name.
            - The entry is created on first use and then updated with the file's column names.

        """
        file_name = new_info["name"]
        per_file = self.summary_dict
        if file_name not in per_file:
            per_file[file_name] = ColumnNameSummary(name=file_name)
        per_file[file_name].update(file_name, new_info["column_names"])

    def get_details_dict(self, column_summary) -> dict:
        """Convert a ColumnNameSummary into the dictionary layout used for reporting.

        Parameters:
            column_summary (ColumnNameSummary): A column name summary for the data file.

        Returns:
            dict - a dictionary with the summary information for column names.

        """
        extracted = column_summary.get_summary()
        file_names = list(column_summary.file_dict.keys())
        details = {
            "Name": extracted["Summary name"],
            "Total events": "n/a",
            "Total files": extracted["Number files"],
            "Files": file_names,
            "Specifics": {"Columns": extracted["Columns"]},
        }
        return details

    def merge_all_info(self) -> "ColumnNameSummary":
        """Combine every per-file summary into a single ColumnNameSummary named "Dataset".

        Returns:
            ColumnNameSummary - the overall summary object for column names.

        """
        merged = ColumnNameSummary(name="Dataset")
        for file_summary in self.summary_dict.values():
            # file_dict maps a file name to the position of its header in unique_headers.
            for file_name, header_pos in file_summary.file_dict.items():
                merged.update(file_name, file_summary.unique_headers[header_pos])
        return merged

    def _get_result_string(self, name, summary, individual=False) -> str:
        """Format the summary for the indicated name as printable text.

        Parameters:
            name (str): Identifier (usually the filename) of the individual file.
            summary (dict): The dictionary of the summary results indexed by name.
            individual (bool): True if individual summary, False otherwise.

        Returns:
            str - The results in a printable format ready to be saved to a text file.

        Notes:
            The name "Dataset" is delegated to _get_dataset_string; any other name yields the first
            entry of its "Columns" list, or an empty string if there are no columns.

        """
        if name == "Dataset":
            return self._get_dataset_string(summary, BaseSummary.DISPLAY_INDENT)

        column_entries = summary.get("Specifics", {}).get("Columns", [])
        if not column_entries:
            return ""
        return f"{BaseSummary.DISPLAY_INDENT}{column_entries[0]!s}"

    @staticmethod
    def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT):
        """Format the overall summary for all the tabular files.

        Parameters:
            result (dict): Dictionary of merged summary information.
            indent (str): String used as the amount to indent for readability.

        Returns:
            str: Formatted string suitable for saving in a file or printing.

        """
        lines = [f"Dataset: Number of files={result.get('Total files', 0)}"]
        for entry in result.get("Specifics", {}).get("Columns", []):
            lines.append(f"{indent}Columns: {entry['Column names']!s}")
            # Files sharing this set of columns are listed one level deeper.
            lines.extend(f"{indent}{indent}{file_name}" for file_name in entry.get("Files", []))
        return "\n".join(lines)