Skip to content

Tools

Utility tools and scripts for working with HED data.

Analysis Tools

TabularSummary

TabularSummary

Summarize the contents of columnar files.

Source code in hed/tools/analysis/tabular_summary.py
class TabularSummary:
    """ Summarize the contents of columnar files. """

    def __init__(self, value_cols=None, skip_cols=None, name=''):
        """ Constructor for a BIDS tabular file summary.

        Parameters:
            value_cols (list, None):  List of columns to be treated as value columns.
            skip_cols (list, None):   List of columns to be skipped.
            name (str):               Name associated with the dictionary.

        Raises:
            HedFileError: If value_cols and skip_cols overlap.

        """

        self.name = name
        self.categorical_info = {}   # column name -> {value: [value count, file count]}
        self.value_info = {}         # column name -> [value count, file count]
        if value_cols and skip_cols and set(value_cols).intersection(skip_cols):
            raise HedFileError("ValueSkipOverlap",
                               f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", "")
        if value_cols:
            for value in value_cols:
                self.value_info[value] = [0, 0]
        if skip_cols:
            self.skip_cols = skip_cols.copy()
        else:
            self.skip_cols = []
        self.total_files = 0
        self.total_events = 0
        self.files = {}

    def __str__(self):
        """ Return a str version of this summary.
        """
        indent = "   "
        summary_list = [f"Summary for column dictionary {self.name}:"]
        sorted_keys = sorted(self.categorical_info.keys())
        summary_list.append(f"{indent}Categorical columns ({len(sorted_keys)}):")
        for key in sorted_keys:
            value_dict = self.categorical_info[key]
            sorted_v_keys = sorted(list(value_dict))
            summary_list.append(f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values):")
            for v_key in sorted_v_keys:
                summary_list.append(f"{indent * 3}{v_key}: {value_dict[v_key]}")

        sorted_cols = sorted(map(str, list(self.value_info)))
        summary_list.append(f"{indent}Value columns ({len(sorted_cols)}):")
        for key in sorted_cols:
            summary_list.append(f"{indent * 2}{key}: {self.value_info[key]}")
        return "\n".join(summary_list)

    def extract_sidecar_template(self) -> dict:
        """ Extract a BIDS sidecar-compatible dictionary.

        Returns:
            dict: A sidecar template that can be converted to JSON.

        """
        side_dict = {}
        for column_name, columns in self.categorical_info.items():
            column_values = list(columns.keys())
            column_values.sort()
            side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, column_values)

        for column_name in self.value_info.keys():
            side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, [])
        return side_dict

    def get_summary(self, as_json=False) -> Union[dict, str]:
        """ Return the summary in dictionary format.

        Parameters:
            as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.

        Returns:
            Union[dict, str]: A dictionary containing the summary information or a JSON string if as_json is True.
        """
        sorted_keys = sorted(self.categorical_info.keys())
        categorical_cols = {}
        for key in sorted_keys:
            cat_dict = self.categorical_info[key]
            sorted_v_keys = sorted(list(cat_dict))
            val_dict = {}
            for v_key in sorted_v_keys:
                val_dict[v_key] = cat_dict[v_key]
            categorical_cols[key] = val_dict
        sorted_cols = sorted(map(str, list(self.value_info)))
        value_cols = {}
        for key in sorted_cols:
            value_cols[key] = self.value_info[key]
        summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
                   "Categorical columns": categorical_cols, "Value columns": value_cols,
                   "Skip columns": self.skip_cols, "Files": self.files}
        if as_json:
            return json.dumps(summary, indent=4)
        else:
            return summary

    def get_number_unique(self, column_names=None) -> dict:
        """ Return the number of unique values in columns.

        Parameters:
            column_names (list, None):   A list of column names to analyze or all columns if None.

        Returns:
            dict: Column names are the keys and the number of unique values in the column are the values.

        """
        if not column_names:
            column_names = list(self.categorical_info.keys())
        counts = {}
        for column_name in column_names:
            if column_name not in self.categorical_info:
                counts[column_name] = 'n/a'
            else:
                counts[column_name] = len(self.categorical_info[column_name].keys())
        return counts

    def update(self, data, name=None):
        """ Update the counts based on data.

        Parameters:
            data (DataFrame, str, or list):  DataFrame, file path, or list of file paths whose data updates the counts.
            name (str): Name to record for the data (only used when data is a DataFrame).

        """

        if isinstance(data, list):
            for filename in data:
                self._update_dataframe(filename, filename)
        elif isinstance(data, str):
            self._update_dataframe(data, data)
        else:
            self._update_dataframe(data, name)

    def update_summary(self, tab_sum):
        """ Add TabularSummary values to this object.

        Parameters:
            tab_sum (TabularSummary):   A TabularSummary to be combined.

        Notes:
            - The value_cols and skip_cols are updated as long as they are not contradictory.
            - A new skip column cannot be used.

        """
        self.total_files = self.total_files + tab_sum.total_files
        self.total_events = self.total_events + tab_sum.total_events
        for file, key in tab_sum.files.items():
            self.files[file] = ''
        self._update_dict_skip(tab_sum)
        self._update_dict_value(tab_sum)
        self._update_dict_categorical(tab_sum)

    def _update_categorical(self, tab_name, values):
        """ Update the categorical information for this summary.

        Parameters:
            tab_name (str): Name of a key indicating a categorical column.
            values (dict): A dictionary whose keys are unique categorical values.

        """
        if tab_name not in self.categorical_info:
            self.categorical_info[tab_name] = {}

        total_values = self.categorical_info[tab_name]
        for name, value in values.items():
            value_list = total_values.get(name, [0, 0])
            # A bare count (from value_counts) represents one file's worth of values.
            if not isinstance(value, list):
                value = [value, 1]
            total_values[name] = [value_list[0] + value[0], value_list[1] + value[1]]

    def _update_dataframe(self, data, name):
        """ Update the information based on columnar data.

        Parameters:
            data (DataFrame, str):  Columnar data (either DataFrame or filename) whose columns are to be summarized.
            name (str): Name of the file corresponding to data.

        """
        df = data_util.get_new_dataframe(data)
        if name:
            self.files[name] = ""
        self.total_files = self.total_files + 1
        self.total_events = self.total_events + len(df.index)
        for col_name, col_values in df.items():
            if self.skip_cols and col_name in self.skip_cols:
                continue
            if col_name in self.value_info.keys():
                self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
                self.value_info[col_name][1] = self.value_info[col_name][1] + 1
            else:
                col_values = col_values.astype(str)
                values = col_values.value_counts(ascending=True)
                self._update_categorical(col_name,  values)

    def _update_dict_categorical(self, col_dict):
        """ Update this summary with the categorical information in the dictionary from another summary.

        Parameters:
            col_dict (TabularSummary):  Summary information from another tabular summary.

        """
        new_cat_cols = col_dict.categorical_info.keys()
        if not new_cat_cols:
            return
        val_cols = self.value_info.keys()
        for col in new_cat_cols:
            if col in val_cols:
                raise HedFileError("CatColShouldBeValueCol",
                                   f"Categorical column [{str(col)}] is already a value column", "")
            elif col in self.skip_cols:
                continue
            else:
                self._update_categorical(col, col_dict.categorical_info[col])

    def _update_dict_skip(self, col_dict):
        """ Update this summary with the skip column information from another summary.

        Parameters:
            col_dict (TabularSummary):  Summary information from another tabular summary.

        """

        if not col_dict.skip_cols:
            return
        cat_cols = self.categorical_info.keys()
        val_cols = self.value_info.keys()
        for col in col_dict.skip_cols:
            if col in cat_cols or col in val_cols:
                raise HedFileError("SkipColInvalid",
                                   f"Skip column [{str(col)}] is already a categorical or value column", "")
            elif col not in self.skip_cols:
                self.skip_cols.append(col)

    def _update_dict_value(self, col_dict):
        """ Update this summary with the value column information from another summary.

        Parameters:
             col_dict (TabularSummary):  Summary information from another tabular summary.

        """
        new_value_cols = col_dict.value_info.keys()
        if not new_value_cols:
            return
        cat_cols = self.categorical_info.keys()
        val_cols = self.value_info.keys()
        for col in new_value_cols:
            if col in cat_cols:
                raise HedFileError("ValueColIsCatCol", f"Value column [{str(col)}] is already a categorical column", "")
            elif col in self.skip_cols:
                continue
            elif col not in val_cols:
                self.value_info[col] = col_dict.value_info[col]
            else:
                self.value_info[col] = [self.value_info[col][0] + col_dict.value_info[col][0],
                                        self.value_info[col][1] + col_dict.value_info[col][1]]

    @staticmethod
    def extract_summary(summary_info) -> 'TabularSummary':
        """ Create a TabularSummary object from a serialized summary.

        Parameters:
            summary_info (dict or str):  A JSON string or a dictionary containing contents of a TabularSummary.

        Returns:
            TabularSummary:  contains the information in summary_info as a TabularSummary object.
        """

        if isinstance(summary_info, str):
            summary_info = json.loads(summary_info)
        # Accept both the keys produced by get_summary ('Name', 'Value columns') and the
        # legacy serialized keys ('Summary name', 'Value_columns') so that
        # get_summary -> extract_summary round-trips without losing information.
        value_info = summary_info.get('Value columns', summary_info.get('Value_columns', {}))
        new_tab = TabularSummary(value_cols=list(value_info.keys()),
                                 skip_cols=summary_info.get('Skip columns', []),
                                 name=summary_info.get('Name', summary_info.get('Summary name', '')))
        new_tab.value_info = value_info
        new_tab.total_files = summary_info.get('Total files', 0)
        new_tab.total_events = summary_info.get('Total events', 0)
        new_tab.skip_cols = summary_info.get('Skip columns', [])
        new_tab.categorical_info = summary_info.get('Categorical columns', {})
        new_tab.files = summary_info.get('Files', {})
        return new_tab

    @staticmethod
    def get_columns_info(dataframe, skip_cols=None) -> dict[str, dict]:
        """ Extract unique value counts for columns.

        Parameters:
            dataframe (DataFrame): The DataFrame to be analyzed.
            skip_cols (list): List of names of columns to be skipped in the extraction.

        Returns:
            dict[str, dict]: A dictionary with keys that are column names (strings) and values that
                           are dictionaries of unique value counts.

        """
        col_info = dict()

        for col_name, col_values in dataframe.items():
            if skip_cols and col_name in skip_cols:
                continue
            col_info[col_name] = col_values.value_counts(ascending=True).to_dict()
        return col_info

    @staticmethod
    def make_combined_dicts(file_dictionary, skip_cols=None) -> tuple['TabularSummary', dict[str, 'TabularSummary']]:
        """ Return combined and individual summaries.

        Parameters:
            file_dictionary (FileDictionary): Dictionary of file name keys and full path.
            skip_cols (list): List of column names to be skipped in the summaries.

        Returns:
            tuple:
                - TabularSummary: A combined summary of all files in the dictionary.
                - dict[str, TabularSummary]: A dictionary where keys are file names and values are individual TabularSummary objects.

        """

        summary_all = TabularSummary(skip_cols=skip_cols)
        summary_dict = {}
        for key, file_path in file_dictionary.items():
            orig_dict = TabularSummary(skip_cols=skip_cols)
            df = data_util.get_new_dataframe(file_path)
            orig_dict.update(df)
            summary_dict[key] = orig_dict
            summary_all.update_summary(orig_dict)
        return summary_all, summary_dict

extract_sidecar_template

extract_sidecar_template() -> dict

Extract a BIDS sidecar-compatible dictionary.

Returns:

Name Type Description
dict dict

A sidecar template that can be converted to JSON.

Source code in hed/tools/analysis/tabular_summary.py
def extract_sidecar_template(self) -> dict:
    """ Build a BIDS sidecar-compatible template dictionary.

    Returns:
        dict: Sidecar entries keyed by column name, convertible to JSON.

    """
    template = {}
    for name in self.categorical_info:
        sorted_values = sorted(self.categorical_info[name].keys())
        template[name] = annotation_util.generate_sidecar_entry(name, sorted_values)
    for name in self.value_info:
        template[name] = annotation_util.generate_sidecar_entry(name, [])
    return template

extract_summary staticmethod

extract_summary(summary_info) -> TabularSummary

Create a TabularSummary object from a serialized summary.

Parameters:

Name Type Description Default
summary_info dict or str

A JSON string or a dictionary containing contents of a TabularSummary.

required

Returns:

Name Type Description
TabularSummary TabularSummary

contains the information in summary_info as a TabularSummary object.

Source code in hed/tools/analysis/tabular_summary.py
@staticmethod
def extract_summary(summary_info) -> 'TabularSummary':
    """ Create a TabularSummary object from a serialized summary.

    Parameters:
        summary_info (dict or str):  A JSON string or a dictionary containing contents of a TabularSummary.

    Returns:
        TabularSummary:  contains the information in summary_info as a TabularSummary object.
    """

    if isinstance(summary_info, str):
        summary_info = json.loads(summary_info)
    # Accept both the keys produced by get_summary ('Name', 'Value columns') and the
    # legacy serialized keys ('Summary name', 'Value_columns') so that
    # get_summary -> extract_summary round-trips without losing information.
    value_info = summary_info.get('Value columns', summary_info.get('Value_columns', {}))
    new_tab = TabularSummary(value_cols=list(value_info.keys()),
                             skip_cols=summary_info.get('Skip columns', []),
                             name=summary_info.get('Name', summary_info.get('Summary name', '')))
    new_tab.value_info = value_info
    new_tab.total_files = summary_info.get('Total files', 0)
    new_tab.total_events = summary_info.get('Total events', 0)
    new_tab.skip_cols = summary_info.get('Skip columns', [])
    new_tab.categorical_info = summary_info.get('Categorical columns', {})
    new_tab.files = summary_info.get('Files', {})
    return new_tab

get_columns_info staticmethod

get_columns_info(
    dataframe, skip_cols=None
) -> dict[str, dict]

Extract unique value counts for columns.

Parameters:

Name Type Description Default
dataframe DataFrame

The DataFrame to be analyzed.

required
skip_cols list

List of names of columns to be skipped in the extraction.

None

Returns:

Type Description
dict[str, dict]

dict[str, dict]: A dictionary with keys that are column names (strings) and values that are dictionaries of unique value counts.

Source code in hed/tools/analysis/tabular_summary.py
@staticmethod
def get_columns_info(dataframe, skip_cols=None) -> dict[str, dict]:
    """ Extract unique value counts for columns.

    Parameters:
        dataframe (DataFrame): The DataFrame to be analyzed.
        skip_cols (list): List of names of columns to be skipped in the extraction.

    Returns:
        dict[str, dict]: A dictionary with keys that are column names (strings) and values that
                       are dictionaries of unique value counts.

    """
    col_info = dict()

    for col_name, col_values in dataframe.items():
        if skip_cols and col_name in skip_cols:
            continue
        col_info[col_name] = col_values.value_counts(ascending=True).to_dict()
    return col_info

get_number_unique

get_number_unique(column_names=None) -> dict

Return the number of unique values in columns.

Parameters:

Name Type Description Default
column_names (list, None)

A list of column names to analyze or all columns if None.

None

Returns:

Name Type Description
dict dict

Column names are the keys and the number of unique values in the column are the values.

Source code in hed/tools/analysis/tabular_summary.py
def get_number_unique(self, column_names=None) -> dict:
    """ Return the number of unique values in columns.

    Parameters:
        column_names (list, None):   A list of column names to analyze or all columns if None.

    Returns:
        dict: Column names are the keys and the number of unique values in the column are the values.
              Columns not known to be categorical map to 'n/a'.

    """
    names = column_names if column_names else list(self.categorical_info)
    return {name: (len(self.categorical_info[name]) if name in self.categorical_info else 'n/a')
            for name in names}

get_summary

get_summary(as_json=False) -> Union[dict, str]

Return the summary in dictionary format.

Parameters:

Name Type Description Default
as_json bool

If False, return as a Python dictionary, otherwise convert to a JSON dictionary.

False

Returns:

Type Description
Union[dict, str]

Union[dict, str]: A dictionary containing the summary information or a JSON string if as_json is True.

Source code in hed/tools/analysis/tabular_summary.py
def get_summary(self, as_json=False) -> Union[dict, str]:
    """ Return the summary in dictionary format.

    Parameters:
        as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.

    Returns:
        Union[dict, str]: A dictionary containing the summary information or a JSON string if as_json is True.
    """
    # Rebuild the categorical mapping with both outer and inner keys in sorted order.
    categorical_cols = {name: {value: self.categorical_info[name][value]
                               for value in sorted(self.categorical_info[name])}
                        for name in sorted(self.categorical_info.keys())}
    value_cols = {name: self.value_info[name] for name in sorted(map(str, list(self.value_info)))}
    summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
               "Categorical columns": categorical_cols, "Value columns": value_cols,
               "Skip columns": self.skip_cols, "Files": self.files}
    return json.dumps(summary, indent=4) if as_json else summary

make_combined_dicts staticmethod

make_combined_dicts(
    file_dictionary, skip_cols=None
) -> tuple[TabularSummary, dict[str, TabularSummary]]

Return combined and individual summaries.

Parameters:

Name Type Description Default
file_dictionary FileDictionary

Dictionary of file name keys and full path.

required
skip_cols list

List of column names to be skipped in the summaries.

None

Returns:

Name Type Description
tuple tuple[TabularSummary, dict[str, TabularSummary]]
  • TabularSummary: A combined summary of all files in the dictionary.
  • dict[str, TabularSummary]: A dictionary where keys are file names and values are individual TabularSummary objects.
Source code in hed/tools/analysis/tabular_summary.py
@staticmethod
def make_combined_dicts(file_dictionary, skip_cols=None) -> tuple['TabularSummary', dict[str, 'TabularSummary']]:
    """ Return combined and individual summaries.

    Parameters:
        file_dictionary (FileDictionary): Dictionary of file name keys and full paths.
        skip_cols (list): List of column names to be skipped in the summaries.

    Returns:
        tuple:
            - TabularSummary: A combined summary of all files in the dictionary.
            - dict[str, TabularSummary]: A dictionary where keys are file names and values are individual TabularSummary objects.

    """

    combined = TabularSummary(skip_cols=skip_cols)
    individual = {}
    for key, file_path in file_dictionary.items():
        file_summary = TabularSummary(skip_cols=skip_cols)
        file_summary.update(data_util.get_new_dataframe(file_path))
        individual[key] = file_summary
        combined.update_summary(file_summary)
    return combined, individual

update

update(data, name=None)

Update the counts based on data.

Parameters:

Name Type Description Default
data DataFrame, str, or list

DataFrame, file path, or list of file paths containing data to update.

required
name str

Name of the summary.

None
Source code in hed/tools/analysis/tabular_summary.py
def update(self, data, name=None):
    """ Update the counts based on data.

    Parameters:
        data (DataFrame, str, or list):  DataFrame, file path, or list of file paths to summarize.
        name (str): Name to associate with the data (used only when data is a DataFrame).

    """

    if isinstance(data, str):
        # A single file path serves as its own name.
        self._update_dataframe(data, data)
    elif isinstance(data, list):
        for item in data:
            self._update_dataframe(item, item)
    else:
        self._update_dataframe(data, name)

update_summary

update_summary(tab_sum)

Add TabularSummary values to this object.

Parameters:

Name Type Description Default
tab_sum TabularSummary

A TabularSummary to be combined.

required
Notes
  • The value_cols and skip_cols are updated as long as they are not contradictory.
  • A new skip column cannot be used.
Source code in hed/tools/analysis/tabular_summary.py
def update_summary(self, tab_sum):
    """ Add TabularSummary values to this object.

    Parameters:
        tab_sum (TabularSummary):   A TabularSummary to be combined.

    Notes:
        - The value_cols and skip_cols are updated as long as they are not contradictory.
        - A new skip column cannot be used.

    """
    self.total_files += tab_sum.total_files
    self.total_events += tab_sum.total_events
    # File values are not carried over -- only the file names are merged.
    for file_name in tab_sum.files:
        self.files[file_name] = ''
    self._update_dict_skip(tab_sum)
    self._update_dict_value(tab_sum)
    self._update_dict_categorical(tab_sum)

Annotation Utilities

annotation_util

Utilities to facilitate annotation of events in BIDS.

check_df_columns

check_df_columns(
    df,
    required_cols=(
        "column_name",
        "column_value",
        "description",
        "HED",
    ),
) -> list[str]

Return a list of the specified columns that are missing from a dataframe.

Parameters:

Name Type Description Default
df DataFrame

Spreadsheet to check the columns of.

required
required_cols tuple

List of column names that must be present.

('column_name', 'column_value', 'description', 'HED')

Returns:

Type Description
list[str]

list[str]: List of column names that are missing.

Source code in hed/tools/analysis/annotation_util.py
def check_df_columns(df, required_cols=('column_name', 'column_value', 'description', 'HED')) -> list[str]:
    """ Return a list of the specified columns that are missing from a dataframe.

    Parameters:
        df (DataFrame):  Spreadsheet to check the columns of.
        required_cols (tuple):  List of column names that must be present.

    Returns:
        list[str]:   List of column names that are missing.

    """
    present = set(df.columns.values)
    return [col for col in required_cols if col not in present]

df_to_hed

df_to_hed(dataframe, description_tag=True) -> dict

Create sidecar-like dictionary from a 4-column dataframe.

Parameters:

Name Type Description Default
dataframe DataFrame

A four-column Pandas DataFrame with specific columns.

required
description_tag bool

If True description tag is included.

True

Returns:

Name Type Description
dict dict

A dictionary compatible with BIDS JSON tabular file that includes HED.

Notes
  • The DataFrame must have the columns with names: column_name, column_value, description, and HED.
Source code in hed/tools/analysis/annotation_util.py
def df_to_hed(dataframe, description_tag=True) -> dict:
    """ Create sidecar-like dictionary from a 4-column dataframe.

    Parameters:
        dataframe (DataFrame):   A four-column Pandas DataFrame with specific columns.
        description_tag (bool):  If True description tag is included.

    Returns:
        dict:  A dictionary compatible with BIDS JSON tabular file that includes HED.

    Notes:
        - The DataFrame must have the columns with names: column_name, column_value, description, and HED.

    """
    filled = dataframe.fillna('n/a')
    missing_cols = check_df_columns(filled)
    if missing_cols:
        raise HedFileError("RequiredColumnsMissing", f"Columns {str(missing_cols)} are missing from dataframe", "")
    hed_dict = {}
    for _, row in filled.iterrows():
        hed_entry = row['HED']
        description = row['description']
        if hed_entry == 'n/a' and description == 'n/a':
            continue  # nothing to record for this row
        name = row['column_name']
        if row['column_value'] == 'n/a':
            # Row describes a value column rather than a categorical level.
            hed_dict[name] = _get_value_entry(hed_entry, description, description_tag=description_tag)
        else:
            cat_dict = hed_dict.get(name, {})
            _update_cat_dict(cat_dict, row['column_value'], hed_entry, description,
                             description_tag=description_tag)
            hed_dict[name] = cat_dict
    return hed_dict

extract_tags

extract_tags(
    hed_string, search_tag
) -> tuple[str, list[str]]

Extract all instances of specified tag from a tag_string.

Parameters:

Name Type Description Default
hed_string str

Tag string from which to extract tag.

required
search_tag str

HED tag to extract.

required

Returns:

Type Description
tuple[str, list[str]]

tuple[str, list[str]]: The tag string without the extracted tags, and a list of the tags that were extracted, for example descriptions.

Source code in hed/tools/analysis/annotation_util.py
def extract_tags(hed_string, search_tag) -> tuple[str, list[str]]:
    """ Extract all instances of specified tag from a tag_string.

        Parameters:
           hed_string (str):   Tag string from which to extract tag.
           search_tag (str):   HED tag to extract.

        Returns:
            tuple[str, list[str]]:
                - Tag string without the tags.
                - A list of the tags that were extracted, for example descriptions.

    """
    # Strip parentheses so each comma-separated piece can be examined individually.
    pieces = hed_string.replace(")", "").replace("(", "").split(",")
    matched = [piece.strip() for piece in pieces if search_tag in piece]
    remaining = hed_string
    for tag in matched:
        remaining = df_util.replace_ref(remaining, tag)
    return remaining, matched

generate_sidecar_entry

generate_sidecar_entry(
    column_name, column_values=None
) -> dict

Create a sidecar column dictionary for column.

Parameters:

Name Type Description Default
column_name str

Name of the column.

required
column_values list

List of column values.

None

Returns: dict: A dictionary representing a template for a sidecar entry.

Source code in hed/tools/analysis/annotation_util.py
def generate_sidecar_entry(column_name, column_values=None) -> dict:
    """ Create a sidecar column dictionary for column.

    Parameters:
        column_name (str):       Name of the column.
        column_values (list):    List of column values.

    Returns:
        dict:   A dictionary representing a template for a sidecar entry.

    """

    # Collapse runs of characters outside [A-Za-z0-9-] into single underscores for labels.
    safe_name = re.sub(r'[^A-Za-z0-9-]+', '_', column_name)
    entry = {"Description": f"Description for {column_name}", "HED": ""}
    if column_values:
        levels = {}
        hed = {}
        for value in column_values:
            if value == "n/a":
                continue
            safe_value = re.sub(r'[^A-Za-z0-9-]+', '_', value)
            levels[value] = f"Here describe column value {value} of column {column_name}"
            hed[value] = f"(Label/{safe_name}, ID/{safe_value})"
        entry["Levels"] = levels
        entry["HED"] = hed
    else:
        entry["HED"] = f"(Label/{safe_name}, ID/#)"
    return entry

hed_to_df

hed_to_df(sidecar_dict, col_names=None) -> DataFrame

Return a 4-column dataframe of HED portions of sidecar.

Parameters:

Name Type Description Default
sidecar_dict dict

A dictionary conforming to BIDS JSON events sidecar format.

required
col_names (list, None)

A list of the cols to include in the flattened sidecar.

None

Returns:

Name Type Description
DataFrame DataFrame

Four-column spreadsheet representing HED portion of sidecar.

Notes
  • The returned DataFrame has columns: column_name, column_value, description, and HED.
Source code in hed/tools/analysis/annotation_util.py
def hed_to_df(sidecar_dict, col_names=None) -> DataFrame:
    """ Return a 4-column dataframe of HED portions of sidecar.

    Parameters:
        sidecar_dict (dict):      A dictionary conforming to BIDS JSON events sidecar format.
        col_names (list, None):   A list of the cols to include in the flattened sidecar.

    Returns:
        DataFrame:  Four-column spreadsheet representing HED portion of sidecar.

    Notes:
        - The returned DataFrame has columns: column_name, column_value, description, and HED.

    """

    if not col_names:
        col_names = sidecar_dict.keys()
    names = []
    values = []
    descriptions = []
    hed_tags = []

    for col_key, col_dict in sidecar_dict.items():
        # Only dictionary entries that carry HED information are flattened.
        if col_key not in col_names or not isinstance(col_dict, dict) or 'HED' not in col_dict:
            continue
        if 'Levels' in col_dict or isinstance(col_dict['HED'], dict):
            keys, vals, descs, tags = _flatten_cat_col(col_key, col_dict)
        else:
            keys, vals, descs, tags = _flatten_val_col(col_key, col_dict)
        names.extend(keys)
        values.extend(vals)
        descriptions.extend(descs)
        hed_tags.extend(tags)

    data = {"column_name": names, "column_value": values,
            "description": descriptions, "HED": hed_tags}
    return pd.DataFrame(data).astype(str)

merge_hed_dict

merge_hed_dict(sidecar_dict, hed_dict)

Update a JSON sidecar based on the hed_dict values.

Parameters:

Name Type Description Default
sidecar_dict dict

Dictionary representation of a BIDS JSON sidecar.

required
hed_dict dict

Dictionary derived from a dataframe representation of HED in sidecar.

required
Source code in hed/tools/analysis/annotation_util.py
def merge_hed_dict(sidecar_dict, hed_dict):
    """ Update a JSON sidecar based on the hed_dict values.

    Parameters:
        sidecar_dict (dict):  Dictionary representation of a BIDS JSON sidecar.
        hed_dict (dict):       Dictionary derived from a dataframe representation of HED in sidecar.

    """

    for key, entry in hed_dict.items():
        if key not in sidecar_dict:
            # Column not yet in the sidecar: copy the whole entry over unchanged.
            sidecar_dict[key] = entry
            continue
        hed_value = entry['HED']
        sidecar_dict[key]['HED'] = hed_value
        if isinstance(hed_value, str):
            # Value column: carry over the description only when one is actually given.
            if entry.get('Description', "n/a") != "n/a":
                sidecar_dict[key]['Description'] = entry['Description']
        elif isinstance(hed_value, dict) and 'Levels' in entry:
            # Categorical column: merge the level descriptions as well.
            sidecar_dict[key]['Levels'] = entry['Levels']

series_to_factor

series_to_factor(series) -> list[int]

Convert a series to an integer factor list.

Parameters:

Name Type Description Default
series Series

Series to be converted to a list.

required

Returns:

Type Description
list[int]

list[int] - contains 0's and 1's; empty values, 'n/a', and np.nan are converted to 0.

Source code in hed/tools/analysis/annotation_util.py
def series_to_factor(series) -> list[int]:
    """Convert a series to an integer factor list.

    Parameters:
        series (pd.Series): Series to be converted to a list.

    Returns:
        list[int]: Contains 0's and 1's; empty values, 'n/a', and np.nan are converted to 0.
    """
    # Map 'n/a' and missing values to False before the boolean cast, then encode as 0/1.
    cleaned = series.replace('n/a', False).fillna(False)
    return [1 if flag else 0 for flag in cleaned.astype(bool)]

str_to_tabular

str_to_tabular(tsv_str, sidecar=None) -> TabularInput

Return a TabularInput from a tsv string.

Parameters:

Name Type Description Default
tsv_str str

A string representing a tabular input.

required
sidecar (Sidecar, str, File or File - like)

An optional Sidecar object.

None

Returns: TabularInput: Represents a tabular input object.

Source code in hed/tools/analysis/annotation_util.py
def str_to_tabular(tsv_str, sidecar=None) -> TabularInput:
    """ Return a TabularInput from a tsv string.

    Parameters:
        tsv_str (str):  A string representing a tabular input.
        sidecar (Sidecar, str, File or File-like): An optional Sidecar object.

     Returns:
         TabularInput:  Represents a tabular input object.
     """

    # Wrap the raw string in an in-memory text buffer so it can be read like a file.
    buffer = io.StringIO(tsv_str)
    return TabularInput(file=buffer, sidecar=sidecar)

strs_to_hed_objs

strs_to_hed_objs(
    hed_strings, hed_schema
) -> Union[list[HedString], None]

Returns a list of HedString objects from a list of strings.

Parameters:

Name Type Description Default
hed_strings string or list

String or strings representing HED annotations.

required
hed_schema HedSchema or HedSchemaGroup

Schema version for the strings.

required

Returns:

Type Description
Union[list[HedString], None]

Union[list[HedString], None]: A list of HedString objects or None.

Source code in hed/tools/analysis/annotation_util.py
def strs_to_hed_objs(hed_strings, hed_schema) -> Union[list[HedString], None]:
    """ Returns a list of HedString objects from a list of strings.

     Parameters:
         hed_strings (string or list):  String or strings representing HED annotations.
         hed_schema (HedSchema or HedSchemaGroup): Schema version for the strings.

     Returns:
         Union[list[HedString], None]:  A list of HedString objects or None if no input was given.

     """
    if not hed_strings:
        # Covers None, the empty string, and the empty list.
        return None
    if not isinstance(hed_strings, list):
        hed_strings = [hed_strings]
    # hed_strings is guaranteed non-empty here, so the former trailing
    # `if hed_strings: ... else: return None` branch was dead code.
    return [HedString(hed, hed_schema=hed_schema) for hed in hed_strings]

strs_to_sidecar

strs_to_sidecar(sidecar_strings) -> Union[Sidecar, None]

Return a Sidecar from a sidecar as string or as a list of sidecars as strings.

Parameters:

Name Type Description Default
sidecar_strings string or list

String or strings representing sidecars.

required

Returns:

Type Description
Union[Sidecar, None]

Union[Sidecar, None]: the merged sidecar from the list.

Source code in hed/tools/analysis/annotation_util.py
def strs_to_sidecar(sidecar_strings) -> Union[Sidecar, None]:
    """ Return a Sidecar from a sidecar as string or as a list of sidecars as strings.

     Parameters:
         sidecar_strings (string or list):  String or strings representing sidecars.

     Returns:
         Union[Sidecar, None]:  the merged sidecar from the list, or None if no input was given.
     """

    if not sidecar_strings:
        # Covers None, the empty string, and the empty list.
        return None
    if not isinstance(sidecar_strings, list):
        sidecar_strings = [sidecar_strings]
    # sidecar_strings is guaranteed non-empty here, so the former trailing
    # `if sidecar_strings: ... else: return None` branch was dead code.
    file_list = [io.StringIO(s_string) for s_string in sidecar_strings]
    return Sidecar(files=file_list, name="Merged_Sidecar")

to_factor

to_factor(data, column=None) -> list[int]

Convert data to an integer factor list.

Parameters:

Name Type Description Default
data Series or DataFrame

Series or DataFrame to be converted to a list.

required
column str

Column name if DataFrame, otherwise column 0 is used.

None

Returns:

Type Description
list[int]

list[int]: A list containing 0's and 1's. Empty, 'n/a', and np.nan values are converted to 0.

Source code in hed/tools/analysis/annotation_util.py
def to_factor(data, column=None) -> list[int]:
    """Convert data to an integer factor list.

    Parameters:
        data (Series or DataFrame): Series or DataFrame to be converted to a list.
        column (str, optional): Column name if DataFrame, otherwise column 0 is used.

    Returns:
        list[int]: A list containing 0's and 1's. Empty, 'n/a', and np.nan values are converted to 0.

    """
    # Pick out the series to convert based on the input type.
    if isinstance(data, Series):
        series = data
    elif isinstance(data, DataFrame):
        series = data[column] if column else data.iloc[:, 0]
    else:
        raise HedFileError("CannotConvertToFactor",
                           f"Expecting Series or DataFrame but got {type(data)}", "")

    # Map 'n/a' and missing values to False before the boolean cast, then encode as 0/1.
    cleaned = series.replace('n/a', False).fillna(False)
    return [1 if flag else 0 for flag in cleaned.astype(bool)]

to_strlist

to_strlist(obj_list) -> list[str]

Convert objects in a list to strings, preserving None values.

Parameters:

Name Type Description Default
obj_list list

A list of objects that are None or have a str method.

required

Returns:

Type Description
list[str]

list[str]: A list with the objects converted to strings. None values are converted to empty strings.

Source code in hed/tools/analysis/annotation_util.py
def to_strlist(obj_list) -> list[str]:
    """Convert objects in a list to strings, preserving None values.

    Parameters:
        obj_list (list): A list of objects that are None or have a str method.

    Returns:
        list[str]: A list with the objects converted to strings. None values become empty strings.

    """
    # None has no meaningful str() form here, so it maps to the empty string.
    return ['' if item is None else str(item) for item in obj_list]

Remodeling Operations

Base Operations

base_op

Base class for remodeling operations.

BaseOp

Bases: ABC

Base class for operations. All remodeling operations should extend this class.

Source code in hed/tools/remodeling/operations/base_op.py
class BaseOp(ABC):
    """ Base class for operations. All remodeling operations should extend this class."""

    def __init__(self, parameters):
        """ Constructor for the BaseOp class. Should be extended by operations.

        Parameters:
            parameters (dict): A dictionary specifying the appropriate parameters for the operation.
        """
        # Raw parameter dictionary; subclasses extract and validate individual entries.
        self.parameters = parameters

    @property
    @abstractmethod
    def NAME(self):
        # Subclasses must supply the operation's name (e.g. "remove_columns").
        pass

    @property
    @abstractmethod
    def PARAMS(self):
        # Subclasses must supply a JSON-schema dictionary describing the valid parameters.
        pass

    @abstractmethod
    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Base class method to be overridden by each operation.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The tabular file to be remodeled.
            name (str): Unique identifier for the data -- often the original file path.
            sidecar (Sidecar or file-like):  A JSON sidecar needed for HED operations.

        Returns:
            DataFrame: A copy of df; overriding implementations return the remodeled frame.
        """

        return df.copy()

    @staticmethod
    @abstractmethod
    def validate_input_data(parameters):
        """ Validates whether operation parameters meet op-specific criteria beyond that captured in json schema.

        Example: A check to see whether two input arrays are the same length.

        Notes: The minimum implementation should return an empty list to indicate no errors were found.
               If additional validation is necessary, method should perform the validation and
               return a list with user-friendly error strings.
        """
        return []

do_op abstractmethod

do_op(dispatcher, df, name, sidecar=None)

Base class method to be overridden by each operation.

Parameters:

Name Type Description Default
dispatcher Dispatcher

Manages the operation I/O.

required
df DataFrame

The tabular file to be remodeled.

required
name str

Unique identifier for the data -- often the original file path.

required
sidecar Sidecar or file - like

A JSON sidecar needed for HED operations.

None
Source code in hed/tools/remodeling/operations/base_op.py
@abstractmethod
def do_op(self, dispatcher, df, name, sidecar=None):
    """ Base class method to be overridden by each operation.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O.
        df (DataFrame): The tabular file to be remodeled.
        name (str): Unique identifier for the data -- often the original file path.
        sidecar (Sidecar or file-like):  A JSON sidecar needed for HED operations.

    Returns:
        DataFrame: A copy of df; overriding implementations return the remodeled frame.
    """

    return df.copy()

validate_input_data abstractmethod staticmethod

validate_input_data(parameters)

Validates whether operation parameters meet op-specific criteria beyond that captured in json schema.

Example: A check to see whether two input arrays are the same length.

The minimum implementation should return an empty list to indicate no errors were found.

If additional validation is necessary, method should perform the validation and return a list with user-friendly error strings.

Source code in hed/tools/remodeling/operations/base_op.py
@staticmethod
@abstractmethod
def validate_input_data(parameters):
    """ Validates whether operation parameters meet op-specific criteria beyond that captured in json schema.

    Example: A check to see whether two input arrays are the same length.

    Returns:
        list: Empty if no errors; otherwise user-friendly error strings.

    Notes: The minimum implementation should return an empty list to indicate no errors were found.
           If additional validation is necessary, method should perform the validation and
           return a list with user-friendly error strings.
    """
    return []

Remove Columns

remove_columns_op

Remove columns from a columnar file.

RemoveColumnsOp

Bases: BaseOp

Remove columns from a columnar file.

Required remodeling parameters
  • column_names (list): The names of the columns to be removed.
  • ignore_missing (boolean): If True, names in column_names that are not columns in df should be ignored.
Source code in hed/tools/remodeling/operations/remove_columns_op.py
class RemoveColumnsOp(BaseOp):
    """ Remove columns from a columnar file.

    Required remodeling parameters:
        - **column_names** (*list*): The names of the columns to be removed.
        - **ignore_missing** (*boolean*): If True, names in column_names that are not columns in df should be ignored.

    """
    NAME = "remove_columns"

    PARAMS = {
        "type": "object",
        "properties": {
            "column_names": {
                "type": "array",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True
            },
            "ignore_missing": {"type": "boolean"}
        },
        "required": ["column_names", "ignore_missing"],
        "additionalProperties": False
    }

    def __init__(self, parameters):
        """ Constructor for remove columns operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.column_names = parameters['column_names']
        # Translate the boolean flag into the errors= mode understood by DataFrame.drop.
        self.error_handling = 'ignore' if parameters['ignore_missing'] else 'raise'

    def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
        """ Remove indicated columns from a dataframe.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like):  Not needed for this operation.

        Returns:
            pd.DataFrame: A new dataframe after processing.

        :raises KeyError:
            - If ignore_missing is False and a column not in the data is to be removed.

        """
        df_new = df.copy()
        try:
            return df_new.drop(self.column_names, axis=1, errors=self.error_handling)
        except KeyError:
            # Re-raise with a message identifying the file and the offending column list.
            raise KeyError("MissingColumnCannotBeRemoved",
                           f"{name}: Ignore missing is False but a column in {str(self.column_names)} is "
                           f"not in the data columns [{str(df_new.columns)}]")

    @staticmethod
    def validate_input_data(parameters):
        """ Additional validation required of operation parameters not performed by JSON schema validator. """
        return []

do_op

do_op(dispatcher, df, name, sidecar=None) -> 'pd.DataFrame'

Remove indicated columns from a dataframe.

Parameters:

Name Type Description Default
dispatcher Dispatcher

Manages the operation I/O.

required
df DataFrame

The DataFrame to be remodeled.

required
name str

Unique identifier for the dataframe -- often the original file path.

required
sidecar Sidecar or file - like

Not needed for this operation.

None

Returns:

Type Description
'pd.DataFrame'

pd.DataFrame: A new dataframe after processing.

:raises KeyError: - If ignore_missing is False and a column not in the data is to be removed.

Source code in hed/tools/remodeling/operations/remove_columns_op.py
def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
    """ Remove indicated columns from a dataframe.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O.
        df (DataFrame): The DataFrame to be remodeled.
        name (str): Unique identifier for the dataframe -- often the original file path.
        sidecar (Sidecar or file-like):  Not needed for this operation.

    Returns:
        pd.DataFrame: A new dataframe after processing.

    :raises KeyError:
        - If ignore_missing is False and a column not in the data is to be removed.

    """
    df_new = df.copy()
    try:
        # self.error_handling is 'ignore' or 'raise', derived from ignore_missing in the constructor.
        return df_new.drop(self.column_names, axis=1, errors=self.error_handling)
    except KeyError:
        # Re-raise with a message identifying the file and the offending column list.
        raise KeyError("MissingColumnCannotBeRemoved",
                       f"{name}: Ignore missing is False but a column in {str(self.column_names)} is "
                       f"not in the data columns [{str(df_new.columns)}]")

validate_input_data staticmethod

validate_input_data(parameters)

Additional validation required of operation parameters not performed by JSON schema validator.

Source code in hed/tools/remodeling/operations/remove_columns_op.py
@staticmethod
def validate_input_data(parameters):
    """ Additional validation required of operation parameters not performed by JSON schema validator.

    Returns:
        list: An empty list, indicating no additional errors.
    """
    return []

Rename Columns

rename_columns_op

Rename columns in a columnar file.

RenameColumnsOp

Bases: BaseOp

Rename columns in a tabular file.

Required remodeling parameters
  • column_mapping (dict): The names of the columns to be renamed with values to be remapped to.
  • ignore_missing (bool): If true, the names in column_mapping that are not columns should be ignored.
Source code in hed/tools/remodeling/operations/rename_columns_op.py
class RenameColumnsOp(BaseOp):
    """ Rename columns in a tabular file.

    Required remodeling parameters:
        - **column_mapping** (*dict*): The names of the columns to be renamed with values to be remapped to.
        - **ignore_missing** (*bool*): If true, the names in column_mapping that are not columns should be ignored.

    """
    NAME = "rename_columns"

    PARAMS = {
        "type": "object",
        "properties": {
            "column_mapping": {
                "type": "object",
                "description": "Mapping between original column names and their respective new names.",
                "patternProperties": {
                    ".*": {
                        "type": "string"
                    }
                },
                "minProperties": 1
            },
            "ignore_missing": {
                "type": "boolean",
                "description": "If true ignore column_mapping keys that don't correspond to columns, otherwise error."
            }
        },
        "required": [
            "column_mapping",
            "ignore_missing"
        ],
        "additionalProperties": False
    }

    def __init__(self, parameters):
        """ Constructor for rename columns operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.column_mapping = parameters['column_mapping']
        # Translate the boolean flag into the errors= mode understood by DataFrame.rename.
        if parameters['ignore_missing']:
            self.error_handling = 'ignore'
        else:
            self.error_handling = 'raise'

    def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
        """ Rename columns as specified in column_mapping dictionary.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like):  Not needed for this operation.

        Returns:
            pd.DataFrame: A new dataframe after processing.

        :raises KeyError:
            - When ignore_missing is False and column_mapping has columns not in the data.

        """
        df_new = df.copy()
        try:
            return df_new.rename(columns=self.column_mapping, errors=self.error_handling)
        except KeyError:
            # Fixed: the closing bracket after the column list was missing from this message.
            raise KeyError("MappedColumnsMissingFromData",
                           f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]"
                           f" but df columns are [{str(df.columns)}]")

    @staticmethod
    def validate_input_data(parameters):
        """ Additional validation required of operation parameters not performed by JSON schema validator. """
        return []

do_op

do_op(dispatcher, df, name, sidecar=None) -> 'pd.DataFrame'

Rename columns as specified in column_mapping dictionary.

Parameters:

Name Type Description Default
dispatcher Dispatcher

Manages the operation I/O.

required
df DataFrame

The DataFrame to be remodeled.

required
name str

Unique identifier for the dataframe -- often the original file path.

required
sidecar Sidecar or file - like

Not needed for this operation.

None

Returns:

Type Description
'pd.DataFrame'

pd.Dataframe: A new dataframe after processing.

:raises KeyError: - When ignore_missing is False and column_mapping has columns not in the data.

Source code in hed/tools/remodeling/operations/rename_columns_op.py
def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
    """ Rename columns as specified in column_mapping dictionary.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O.
        df (DataFrame): The DataFrame to be remodeled.
        name (str): Unique identifier for the dataframe -- often the original file path.
        sidecar (Sidecar or file-like):  Not needed for this operation.

    Returns:
        pd.DataFrame: A new dataframe after processing.

    :raises KeyError:
        - When ignore_missing is False and column_mapping has columns not in the data.

    """
    df_new = df.copy()
    try:
        return df_new.rename(columns=self.column_mapping, errors=self.error_handling)
    except KeyError:
        # Fixed: the closing bracket after the column list was missing from this message.
        raise KeyError("MappedColumnsMissingFromData",
                       f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]"
                       f" but df columns are [{str(df.columns)}]")

validate_input_data staticmethod

validate_input_data(parameters)

Additional validation required of operation parameters not performed by JSON schema validator.

Source code in hed/tools/remodeling/operations/rename_columns_op.py
@staticmethod
def validate_input_data(parameters):
    """ Additional validation required of operation parameters not performed by JSON schema validator.

    Returns:
        list: An empty list, indicating no additional errors.
    """
    return []

BIDS Tools

BIDS Dataset Processing

bids

Models for BIDS datasets and files.