Source code for remodeler.operations.split_rows_op

"""Split rows in a columnar file with  onset and duration columns into multiple rows based on a specified column."""

import numpy as np
import pandas as pd
from remodeler.operations.base_op import BaseOp


class SplitRowsOp(BaseOp):
    """Split rows in a columnar file with onset and duration columns into multiple rows based on a specified column.

    Required remodeling parameters:
        - **anchor_column** (*str*): The column in which the names of new items are stored.
        - **new_events** (*dict*): Mapping of new values based on values in the original row.
        - **remove_parent_row** (*bool*): If true, the original row that was split is removed.

    Notes:
        - In specifying onset and duration for the new row, you can give values or the names of columns as strings.

    """

    NAME = "split_rows"

    PARAMS = {
        "type": "object",
        "properties": {
            "anchor_column": {
                "type": "string",
                "description": "The column containing the keys for the new rows. (Original rows will have their own keys.)",
            },
            "new_events": {
                "type": "object",
                "description": "A map describing how the rows for the new codes will be created.",
                "patternProperties": {
                    ".*": {
                        "type": "object",
                        "properties": {
                            "onset_source": {
                                "type": "array",
                                "description": "List of items to add to compute the onset time of the new row.",
                                "items": {"type": ["string", "number"]},
                                "minItems": 1,
                            },
                            "duration": {
                                "type": "array",
                                "description": "List of items to add to compute the duration of the new row.",
                                "items": {"type": ["string", "number"]},
                                "minItems": 1,
                            },
                            "copy_columns": {
                                "type": "array",
                                "description": "List of columns whose values to copy for the new row.",
                                "items": {"type": "string"},
                                "minItems": 1,
                                "uniqueItems": True,
                            },
                        },
                        "required": ["onset_source", "duration"],
                        "additionalProperties": False,
                    }
                },
                "minProperties": 1,
            },
            "remove_parent_row": {
                "type": "boolean",
                "description": "If true, the row from which these rows were split is removed, otherwise it stays.",
            },
        },
        "required": ["anchor_column", "new_events", "remove_parent_row"],
        "additionalProperties": False,
    }
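    # A minimal sketch (hypothetical column and event names) of a parameter dictionary
    # that satisfies the PARAMS schema above:
    #
    #     {
    #         "anchor_column": "trial_type",
    #         "new_events": {
    #             "response": {
    #                 "onset_source": ["response_time"],
    #                 "duration": [0.25],
    #                 "copy_columns": ["response_accuracy"]
    #             }
    #         },
    #         "remove_parent_row": False
    #     }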
    def __init__(self, parameters):
        """Constructor for the split rows operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.anchor_column = parameters["anchor_column"]
        self.new_events = parameters["new_events"]
        self.remove_parent_row = parameters["remove_parent_row"]
    def do_op(self, dispatcher, df, name, sidecar=None) -> pd.DataFrame:
        """Split a row representing a particular event into multiple rows.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not needed for this operation.

        Returns:
            DataFrame: A new dataframe after processing.

        Raises:
            ValueError: If the data is missing an onset or duration column.
            TypeError: If a bad onset or duration specifier is encountered.

        """
        if "onset" not in df.columns:
            raise ValueError("MissingOnsetColumn",
                             f"{name}: Data must have an onset column for split_rows_op")
        elif "duration" not in df.columns:
            raise ValueError("MissingDurationColumn",
                             f"{name}: Data must have a duration column for split_rows_op")
        df_new = df.copy()

        if self.anchor_column not in df_new.columns:
            df_new[self.anchor_column] = np.nan
        if self.remove_parent_row:
            df_list = []
        else:
            df_list = [df_new]
        self._split_rows(df, df_list)
        df_ret = pd.concat(df_list, axis=0, ignore_index=True)
        df_ret["onset"] = df_ret["onset"].apply(pd.to_numeric)
        df_ret = df_ret.sort_values("onset").reset_index(drop=True)
        return df_ret
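    # Rough illustration (hypothetical columns and values) of what do_op produces for a
    # single parent row, using the parameter sketch above (anchor_column="trial_type",
    # a "response" entry with onset_source=["response_time"] and duration=[0.25],
    # remove_parent_row=False; copy_columns omitted). Only a few columns are shown:
    #
    #     onset  duration  response_time  trial_type
    #     5.0    0.5       0.4            go
    #
    # becomes
    #
    #     onset  duration  response_time  trial_type
    #     5.0    0.5       0.4            go
    #     5.4    0.25      NaN            response
    #
    # The parent row is kept because remove_parent_row is False; the new row's onset is
    # 5.0 + 0.4, its duration is 0.25, and the result is re-sorted by onset.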
    def _split_rows(self, df, df_list):
        """Split the rows based on an anchor and different columns.

        Parameters:
            df (DataFrame): The DataFrame to be split.
            df_list (list): The list of new event DataFrames and possibly the parent DataFrame.

        """
        for event, event_params in self.new_events.items():
            add_events = pd.DataFrame([], columns=df.columns)
            add_events["onset"] = self._create_onsets(df, event_params["onset_source"])
            add_events[self.anchor_column] = event
            self._add_durations(df, add_events, event_params["duration"])
            # copy_columns is optional in the schema, so use get to avoid a KeyError.
            if len(event_params.get("copy_columns", [])) > 0:
                for column in event_params["copy_columns"]:
                    add_events[column] = df[column]

            # Drop new rows whose computed onset is missing.
            add_events = add_events.dropna(axis="rows", subset=["onset"])
            df_list.append(add_events)

    @staticmethod
    def _add_durations(df, add_events, duration_sources):
        """Compute durations for the new rows by summing numeric values and column values."""
        add_events["duration"] = 0
        for duration in duration_sources:
            if isinstance(duration, (float, int)):
                add_events["duration"] = add_events["duration"].add(duration)
            elif isinstance(duration, str) and duration in list(df.columns):
                add_events["duration"] = add_events["duration"].add(pd.to_numeric(df[duration], errors="coerce"))
            else:
                raise TypeError("BadDurationInModel",
                                f"Remodeling duration {str(duration)} must either be numeric or a column name", "")

    @staticmethod
    def _create_onsets(df, onset_source):
        """Create a vector of onsets for the new events.

        Parameters:
            df (DataFrame): The dataframe to process.
            onset_source (list): List of onset specifiers to process.

        Returns:
            Series: A Series of the same length as df containing the computed onsets.

        Raises:
            TypeError: If one of the onset specifiers is invalid.

        """
        onsets = pd.to_numeric(df["onset"], errors="coerce")
        for onset in onset_source:
            if isinstance(onset, (float, int)):
                onsets = onsets + onset
            elif isinstance(onset, str) and onset in list(df.columns):
                onsets = onsets.add(pd.to_numeric(df[onset], errors="coerce"))
            else:
                raise TypeError("BadOnsetInModel",
                                f"Remodeling onset {str(onset)} must either be numeric or a column name.", "")
        return onsets
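    # Hypothetical illustration of how _create_onsets and _add_durations combine their
    # source lists: entries may be numbers or names of existing columns, and they are
    # simply summed. For example, onset_source=["response_time", 0.1] yields
    # onset + response_time + 0.1, while duration=["stim_duration"] takes the value of
    # the stim_duration column. An entry that is neither a number nor an existing
    # column name raises TypeError.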
    @staticmethod
    def validate_input_data(parameters):
        """Additional validation required of operation parameters not performed by JSON schema validator."""
        return []
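# Example usage sketch, not part of the module: constructing the operation directly and
# applying it to a small pandas DataFrame. The parameter values and column names are
# hypothetical, None is passed for dispatcher because the do_op implementation above does
# not use it, and BaseOp's constructor is assumed to accept the parameter dictionary as
# shown in __init__:
#
#     import pandas as pd
#     from remodeler.operations.split_rows_op import SplitRowsOp
#
#     params = {
#         "anchor_column": "trial_type",
#         "new_events": {
#             "response": {"onset_source": ["response_time"], "duration": [0.25]}
#         },
#         "remove_parent_row": False,
#     }
#     op = SplitRowsOp(params)
#     df = pd.DataFrame({"onset": [5.0], "duration": [0.5],
#                        "response_time": [0.4], "trial_type": ["go"]})
#     df_split = op.do_op(None, df, "example_events.tsv")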