Skip to content

Tools

Utility tools and scripts for working with HED data.

Analysis Tools

TabularSummary

TabularSummary

Summarize the contents of columnar files.

Source code in hed/tools/analysis/tabular_summary.py
class TabularSummary:
    """ Summarize the contents of columnar files. """

    def __init__(self, value_cols=None, skip_cols=None, name=''):
        """ Constructor for a BIDS tabular file summary.

        Parameters:
            value_cols (list, None):  List of columns to be treated as value columns.
            skip_cols (list, None):   List of columns to be skipped.
            name (str):               Name associated with the dictionary.

        Raises:
            HedFileError: If value_cols and skip_cols overlap.

        """

        self.name = name
        self.categorical_info = {}   # column name -> {value: [value count, file count]}
        self.value_info = {}         # column name -> [value count, file count]
        if value_cols and skip_cols and set(value_cols).intersection(skip_cols):
            raise HedFileError("ValueSkipOverlap",
                               f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", "")
        if value_cols:
            for value in value_cols:
                self.value_info[value] = [0, 0]
        if skip_cols:
            self.skip_cols = skip_cols.copy()
        else:
            self.skip_cols = []
        self.total_files = 0
        self.total_events = 0
        self.files = {}

    def __str__(self):
        """ Return a str version of this summary.
        """
        indent = "   "
        summary_list = [f"Summary for column dictionary {self.name}:"]
        sorted_keys = sorted(self.categorical_info.keys())
        summary_list.append(f"{indent}Categorical columns ({len(sorted_keys)}):")
        for key in sorted_keys:
            value_dict = self.categorical_info[key]
            sorted_v_keys = sorted(list(value_dict))
            summary_list.append(f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values):")
            for v_key in sorted_v_keys:
                summary_list.append(f"{indent * 3}{v_key}: {value_dict[v_key]}")

        sorted_cols = sorted(map(str, list(self.value_info)))
        summary_list.append(f"{indent}Value columns ({len(sorted_cols)}):")
        for key in sorted_cols:
            summary_list.append(f"{indent * 2}{key}: {self.value_info[key]}")
        return "\n".join(summary_list)

    def extract_sidecar_template(self) -> dict:
        """ Extract a BIDS sidecar-compatible dictionary.

        Returns:
            dict: A sidecar template that can be converted to JSON.

        """
        side_dict = {}
        for column_name, columns in self.categorical_info.items():
            column_values = list(columns.keys())
            column_values.sort()
            side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, column_values)

        for column_name in self.value_info.keys():
            side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, [])
        return side_dict

    def get_summary(self, as_json=False) -> Union[dict, str]:
        """ Return the summary in dictionary format.

        Parameters:
            as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.

        Returns:
            Union[dict, str]: A dictionary containing the summary information or a JSON string if as_json is True.
        """
        sorted_keys = sorted(self.categorical_info.keys())
        categorical_cols = {}
        for key in sorted_keys:
            cat_dict = self.categorical_info[key]
            sorted_v_keys = sorted(list(cat_dict))
            val_dict = {}
            for v_key in sorted_v_keys:
                val_dict[v_key] = cat_dict[v_key]
            categorical_cols[key] = val_dict
        sorted_cols = sorted(map(str, list(self.value_info)))
        value_cols = {}
        for key in sorted_cols:
            value_cols[key] = self.value_info[key]
        summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
                   "Categorical columns": categorical_cols, "Value columns": value_cols,
                   "Skip columns": self.skip_cols, "Files": self.files}
        if as_json:
            return json.dumps(summary, indent=4)
        else:
            return summary

    def get_number_unique(self, column_names=None) -> dict:
        """ Return the number of unique values in columns.

        Parameters:
            column_names (list, None):   A list of column names to analyze or all columns if None.

        Returns:
            dict: Column names are the keys and the number of unique values in the column are the values.

        """
        if not column_names:
            column_names = list(self.categorical_info.keys())
        counts = {}
        for column_name in column_names:
            if column_name not in self.categorical_info:
                counts[column_name] = 'n/a'
            else:
                counts[column_name] = len(self.categorical_info[column_name].keys())
        return counts

    def update(self, data, name=None):
        """ Update the counts based on data.

        Parameters:
            data (DataFrame, str, or list):  DataFrame, file path, or list of file paths whose data updates the counts.
            name (str): Name to record for the data (only used when data is a DataFrame).

        """

        if isinstance(data, list):
            for filename in data:
                self._update_dataframe(filename, filename)
        elif isinstance(data, str):
            self._update_dataframe(data, data)
        else:
            self._update_dataframe(data, name)

    def update_summary(self, tab_sum):
        """ Add TabularSummary values to this object.

        Parameters:
            tab_sum (TabularSummary):   A TabularSummary to be combined.

        Notes:
            - The value_cols and skip_cols are updated as long as they are not contradictory.
            - A new skip column cannot be used.

        """
        self.total_files = self.total_files + tab_sum.total_files
        self.total_events = self.total_events + tab_sum.total_events
        for file, key in tab_sum.files.items():
            self.files[file] = ''
        self._update_dict_skip(tab_sum)
        self._update_dict_value(tab_sum)
        self._update_dict_categorical(tab_sum)

    def _update_categorical(self, tab_name, values):
        """ Update the categorical information for this summary.

        Parameters:
            tab_name (str): Name of a key indicating a categorical column.
            values (dict): A dictionary whose keys are unique categorical values.

        """
        if tab_name not in self.categorical_info:
            self.categorical_info[tab_name] = {}

        total_values = self.categorical_info[tab_name]
        for name, value in values.items():
            value_list = total_values.get(name, [0, 0])
            # A bare count (from value_counts) represents one file's worth of values.
            if not isinstance(value, list):
                value = [value, 1]
            total_values[name] = [value_list[0] + value[0], value_list[1] + value[1]]

    def _update_dataframe(self, data, name):
        """ Update the information based on columnar data.

        Parameters:
            data (DataFrame, str):  Columnar data (either DataFrame or filename) whose columns are to be summarized.
            name (str): Name of the file corresponding to data.

        """
        df = data_util.get_new_dataframe(data)
        if name:
            self.files[name] = ""
        self.total_files = self.total_files + 1
        self.total_events = self.total_events + len(df.index)
        for col_name, col_values in df.items():
            if self.skip_cols and col_name in self.skip_cols:
                continue
            if col_name in self.value_info.keys():
                self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
                self.value_info[col_name][1] = self.value_info[col_name][1] + 1
            else:
                col_values = col_values.astype(str)
                values = col_values.value_counts(ascending=True)
                self._update_categorical(col_name,  values)

    def _update_dict_categorical(self, col_dict):
        """ Update this summary with the categorical information in the dictionary from another summary.

        Parameters:
            col_dict (TabularSummary):  Summary information from another tabular summary.

        """
        new_cat_cols = col_dict.categorical_info.keys()
        if not new_cat_cols:
            return
        val_cols = self.value_info.keys()
        for col in new_cat_cols:
            if col in val_cols:
                raise HedFileError("CatColShouldBeValueCol",
                                   f"Categorical column [{str(col)}] is already a value column", "")
            elif col in self.skip_cols:
                continue
            else:
                self._update_categorical(col, col_dict.categorical_info[col])

    def _update_dict_skip(self, col_dict):
        """ Update this summary with the skip column information from another summary.

        Parameters:
            col_dict (TabularSummary):  Summary information from another tabular summary.

        """

        if not col_dict.skip_cols:
            return
        cat_cols = self.categorical_info.keys()
        val_cols = self.value_info.keys()
        for col in col_dict.skip_cols:
            if col in cat_cols or col in val_cols:
                raise HedFileError("SkipColInvalid",
                                   f"Skip column [{str(col)}] is already a categorical or value column", "")
            elif col not in self.skip_cols:
                self.skip_cols.append(col)

    def _update_dict_value(self, col_dict):
        """ Update this summary with the value column information from another summary.

        Parameters:
             col_dict (TabularSummary):  Summary information from another tabular summary.

        """
        new_value_cols = col_dict.value_info.keys()
        if not new_value_cols:
            return
        cat_cols = self.categorical_info.keys()
        val_cols = self.value_info.keys()
        for col in new_value_cols:
            if col in cat_cols:
                raise HedFileError("ValueColIsCatCol", f"Value column [{str(col)}] is already a categorical column", "")
            elif col in self.skip_cols:
                continue
            elif col not in val_cols:
                self.value_info[col] = col_dict.value_info[col]
            else:
                self.value_info[col] = [self.value_info[col][0] + col_dict.value_info[col][0],
                                        self.value_info[col][1] + col_dict.value_info[col][1]]

    @staticmethod
    def extract_summary(summary_info) -> 'TabularSummary':
        """ Create a TabularSummary object from a serialized summary.

        Parameters:
            summary_info (dict or str):  A JSON string or a dictionary containing contents of a TabularSummary.

        Returns:
            TabularSummary:  contains the information in summary_info as a TabularSummary object.
        """

        if isinstance(summary_info, str):
            summary_info = json.loads(summary_info)
        # Accept both the keys produced by get_summary ('Name', 'Value columns') and the
        # legacy serialized keys ('Summary name', 'Value_columns') so that
        # get_summary -> extract_summary round-trips without losing information.
        value_info = summary_info.get('Value columns', summary_info.get('Value_columns', {}))
        new_tab = TabularSummary(value_cols=list(value_info.keys()),
                                 skip_cols=summary_info.get('Skip columns', []),
                                 name=summary_info.get('Name', summary_info.get('Summary name', '')))
        new_tab.value_info = value_info
        new_tab.total_files = summary_info.get('Total files', 0)
        new_tab.total_events = summary_info.get('Total events', 0)
        new_tab.skip_cols = summary_info.get('Skip columns', [])
        new_tab.categorical_info = summary_info.get('Categorical columns', {})
        new_tab.files = summary_info.get('Files', {})
        return new_tab

    @staticmethod
    def get_columns_info(dataframe, skip_cols=None) -> dict[str, dict]:
        """ Extract unique value counts for columns.

        Parameters:
            dataframe (DataFrame): The DataFrame to be analyzed.
            skip_cols (list): List of names of columns to be skipped in the extraction.

        Returns:
            dict[str, dict]: A dictionary with keys that are column names (strings) and values that
                           are dictionaries of unique value counts.

        """
        col_info = dict()

        for col_name, col_values in dataframe.items():
            if skip_cols and col_name in skip_cols:
                continue
            col_info[col_name] = col_values.value_counts(ascending=True).to_dict()
        return col_info

    @staticmethod
    def make_combined_dicts(file_dictionary, skip_cols=None) -> tuple['TabularSummary', dict[str, 'TabularSummary']]:
        """ Return combined and individual summaries.

        Parameters:
            file_dictionary (FileDictionary): Dictionary of file name keys and full path.
            skip_cols (list): List of column names to be skipped in the summaries.

        Returns:
            tuple:
                - TabularSummary: A combined summary of all files in the dictionary.
                - dict[str, TabularSummary]: A dictionary where keys are file names and values are individual TabularSummary objects.

        """

        summary_all = TabularSummary(skip_cols=skip_cols)
        summary_dict = {}
        for key, file_path in file_dictionary.items():
            orig_dict = TabularSummary(skip_cols=skip_cols)
            df = data_util.get_new_dataframe(file_path)
            orig_dict.update(df)
            summary_dict[key] = orig_dict
            summary_all.update_summary(orig_dict)
        return summary_all, summary_dict

extract_sidecar_template

extract_sidecar_template() -> dict

Extract a BIDS sidecar-compatible dictionary.

Returns:

Name Type Description
dict dict

A sidecar template that can be converted to JSON.

Source code in hed/tools/analysis/tabular_summary.py
def extract_sidecar_template(self) -> dict:
    """ Build a BIDS sidecar-compatible template dictionary.

    Returns:
        dict: Sidecar entries keyed by column name, convertible to JSON.

    """
    template = {}
    for name in self.categorical_info:
        sorted_values = sorted(self.categorical_info[name].keys())
        template[name] = annotation_util.generate_sidecar_entry(name, sorted_values)
    for name in self.value_info:
        template[name] = annotation_util.generate_sidecar_entry(name, [])
    return template

extract_summary staticmethod

extract_summary(summary_info) -> TabularSummary

Create a TabularSummary object from a serialized summary.

Parameters:

Name Type Description Default
summary_info dict or str

A JSON string or a dictionary containing contents of a TabularSummary.

required

Returns:

Name Type Description
TabularSummary TabularSummary

contains the information in summary_info as a TabularSummary object.

Source code in hed/tools/analysis/tabular_summary.py
@staticmethod
def extract_summary(summary_info) -> 'TabularSummary':
    """ Create a TabularSummary object from a serialized summary.

    Parameters:
        summary_info (dict or str):  A JSON string or a dictionary containing contents of a TabularSummary.

    Returns:
        TabularSummary:  contains the information in summary_info as a TabularSummary object.
    """

    if isinstance(summary_info, str):
        summary_info = json.loads(summary_info)
    # Accept both the keys produced by get_summary ('Name', 'Value columns') and the
    # legacy serialized keys ('Summary name', 'Value_columns') so that
    # get_summary -> extract_summary round-trips without losing information.
    value_info = summary_info.get('Value columns', summary_info.get('Value_columns', {}))
    new_tab = TabularSummary(value_cols=list(value_info.keys()),
                             skip_cols=summary_info.get('Skip columns', []),
                             name=summary_info.get('Name', summary_info.get('Summary name', '')))
    new_tab.value_info = value_info
    new_tab.total_files = summary_info.get('Total files', 0)
    new_tab.total_events = summary_info.get('Total events', 0)
    new_tab.skip_cols = summary_info.get('Skip columns', [])
    new_tab.categorical_info = summary_info.get('Categorical columns', {})
    new_tab.files = summary_info.get('Files', {})
    return new_tab

get_columns_info staticmethod

get_columns_info(
    dataframe, skip_cols=None
) -> dict[str, dict]

Extract unique value counts for columns.

Parameters:

Name Type Description Default
dataframe DataFrame

The DataFrame to be analyzed.

required
skip_cols list

List of names of columns to be skipped in the extraction.

None

Returns:

Type Description
dict[str, dict]

dict[str, dict]: A dictionary with keys that are column names (strings) and values that are dictionaries of unique value counts.

Source code in hed/tools/analysis/tabular_summary.py
@staticmethod
def get_columns_info(dataframe, skip_cols=None) -> dict[str, dict]:
    """ Extract unique value counts for columns.

    Parameters:
        dataframe (DataFrame): The DataFrame to be analyzed.
        skip_cols (list): List of names of columns to be skipped in the extraction.

    Returns:
        dict[str, dict]: A dictionary with keys that are column names (strings) and values that
                       are dictionaries of unique value counts.

    """
    col_info = dict()

    for col_name, col_values in dataframe.items():
        if skip_cols and col_name in skip_cols:
            continue
        col_info[col_name] = col_values.value_counts(ascending=True).to_dict()
    return col_info

get_number_unique

get_number_unique(column_names=None) -> dict

Return the number of unique values in columns.

Parameters:

Name Type Description Default
column_names (list, None)

A list of column names to analyze or all columns if None.

None

Returns:

Name Type Description
dict dict

Column names are the keys and the number of unique values in the column are the values.

Source code in hed/tools/analysis/tabular_summary.py
def get_number_unique(self, column_names=None) -> dict:
    """ Return the number of unique values in columns.

    Parameters:
        column_names (list, None):   A list of column names to analyze or all columns if None.

    Returns:
        dict: Column names are the keys and the number of unique values in the column are the values.
              Columns not known to be categorical map to 'n/a'.

    """
    names = column_names if column_names else list(self.categorical_info)
    return {name: (len(self.categorical_info[name]) if name in self.categorical_info else 'n/a')
            for name in names}

get_summary

get_summary(as_json=False) -> Union[dict, str]

Return the summary in dictionary format.

Parameters:

Name Type Description Default
as_json bool

If False, return as a Python dictionary, otherwise convert to a JSON dictionary.

False

Returns:

Type Description
Union[dict, str]

Union[dict, str]: A dictionary containing the summary information or a JSON string if as_json is True.

Source code in hed/tools/analysis/tabular_summary.py
def get_summary(self, as_json=False) -> Union[dict, str]:
    """ Return the summary in dictionary format.

    Parameters:
        as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.

    Returns:
        Union[dict, str]: A dictionary containing the summary information or a JSON string if as_json is True.
    """
    # Rebuild the categorical mapping with both outer and inner keys in sorted order.
    categorical_cols = {name: {value: self.categorical_info[name][value]
                               for value in sorted(self.categorical_info[name])}
                        for name in sorted(self.categorical_info.keys())}
    value_cols = {name: self.value_info[name] for name in sorted(map(str, list(self.value_info)))}
    summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
               "Categorical columns": categorical_cols, "Value columns": value_cols,
               "Skip columns": self.skip_cols, "Files": self.files}
    return json.dumps(summary, indent=4) if as_json else summary

make_combined_dicts staticmethod

make_combined_dicts(
    file_dictionary, skip_cols=None
) -> tuple[TabularSummary, dict[str, TabularSummary]]

Return combined and individual summaries.

Parameters:

Name Type Description Default
file_dictionary FileDictionary

Dictionary of file name keys and full path.

required
skip_cols list

List of column names to be skipped in the summaries.

None

Returns:

Name Type Description
tuple tuple[TabularSummary, dict[str, TabularSummary]]
  • TabularSummary: A combined summary of all files in the dictionary.
  • dict[str, TabularSummary]: A dictionary where keys are file names and values are individual TabularSummary objects.
Source code in hed/tools/analysis/tabular_summary.py
@staticmethod
def make_combined_dicts(file_dictionary, skip_cols=None) -> tuple['TabularSummary', dict[str, 'TabularSummary']]:
    """ Return combined and individual summaries.

    Parameters:
        file_dictionary (FileDictionary): Dictionary of file name keys and full paths.
        skip_cols (list): List of column names to be skipped in the summaries.

    Returns:
        tuple:
            - TabularSummary: A combined summary of all files in the dictionary.
            - dict[str, TabularSummary]: A dictionary where keys are file names and values are individual TabularSummary objects.

    """

    combined = TabularSummary(skip_cols=skip_cols)
    individual = {}
    for key, file_path in file_dictionary.items():
        file_summary = TabularSummary(skip_cols=skip_cols)
        file_summary.update(data_util.get_new_dataframe(file_path))
        individual[key] = file_summary
        combined.update_summary(file_summary)
    return combined, individual

update

update(data, name=None)

Update the counts based on data.

Parameters:

Name Type Description Default
data DataFrame, str, or list

DataFrame, file path, or list of file paths containing data to update.

required
name str

Name of the summary.

None
Source code in hed/tools/analysis/tabular_summary.py
def update(self, data, name=None):
    """ Update the counts based on data.

    Parameters:
        data (DataFrame, str, or list):  DataFrame, file path, or list of file paths to summarize.
        name (str): Name to associate with the data (used only when data is a DataFrame).

    """

    if isinstance(data, str):
        # A single file path serves as its own name.
        self._update_dataframe(data, data)
    elif isinstance(data, list):
        for item in data:
            self._update_dataframe(item, item)
    else:
        self._update_dataframe(data, name)

update_summary

update_summary(tab_sum)

Add TabularSummary values to this object.

Parameters:

Name Type Description Default
tab_sum TabularSummary

A TabularSummary to be combined.

required
Notes
  • The value_cols and skip_cols are updated as long as they are not contradictory.
  • A new skip column cannot be used.
Source code in hed/tools/analysis/tabular_summary.py
def update_summary(self, tab_sum):
    """ Add TabularSummary values to this object.

    Parameters:
        tab_sum (TabularSummary):   A TabularSummary to be combined.

    Notes:
        - The value_cols and skip_cols are updated as long as they are not contradictory.
        - A new skip column cannot be used.

    """
    self.total_files += tab_sum.total_files
    self.total_events += tab_sum.total_events
    # File values are not carried over -- only the file names are merged.
    for file_name in tab_sum.files:
        self.files[file_name] = ''
    self._update_dict_skip(tab_sum)
    self._update_dict_value(tab_sum)
    self._update_dict_categorical(tab_sum)

Annotation Utilities

annotation_util

Utilities to facilitate annotation of events in BIDS.

check_df_columns

check_df_columns(
    df,
    required_cols=(
        "column_name",
        "column_value",
        "description",
        "HED",
    ),
) -> list[str]

Return a list of the specified columns that are missing from a dataframe.

Parameters:

Name Type Description Default
df DataFrame

Spreadsheet to check the columns of.

required
required_cols tuple

List of column names that must be present.

('column_name', 'column_value', 'description', 'HED')

Returns:

Type Description
list[str]

list[str]: List of column names that are missing.

Source code in hed/tools/analysis/annotation_util.py
def check_df_columns(df, required_cols=('column_name', 'column_value', 'description', 'HED')) -> list[str]:
    """ Return a list of the specified columns that are missing from a dataframe.

    Parameters:
        df (DataFrame):  Spreadsheet to check the columns of.
        required_cols (tuple):  List of column names that must be present.

    Returns:
        list[str]:   List of column names that are missing.

    """
    present = set(df.columns.values)
    return [col for col in required_cols if col not in present]

df_to_hed

df_to_hed(dataframe, description_tag=True) -> dict

Create sidecar-like dictionary from a 4-column dataframe.

Parameters:

Name Type Description Default
dataframe DataFrame

A four-column Pandas DataFrame with specific columns.

required
description_tag bool

If True description tag is included.

True

Returns:

Name Type Description
dict dict

A dictionary compatible with BIDS JSON tabular file that includes HED.

Notes
  • The DataFrame must have the columns with names: column_name, column_value, description, and HED.
Source code in hed/tools/analysis/annotation_util.py
def df_to_hed(dataframe, description_tag=True) -> dict:
    """ Create sidecar-like dictionary from a 4-column dataframe.

    Parameters:
        dataframe (DataFrame):   A four-column Pandas DataFrame with specific columns.
        description_tag (bool):  If True description tag is included.

    Returns:
        dict:  A dictionary compatible with BIDS JSON tabular file that includes HED.

    Notes:
        - The DataFrame must have the columns with names: column_name, column_value, description, and HED.

    """
    filled = dataframe.fillna('n/a')
    missing_cols = check_df_columns(filled)
    if missing_cols:
        raise HedFileError("RequiredColumnsMissing", f"Columns {str(missing_cols)} are missing from dataframe", "")
    hed_dict = {}
    for _, row in filled.iterrows():
        hed_entry = row['HED']
        description = row['description']
        if hed_entry == 'n/a' and description == 'n/a':
            continue  # nothing to record for this row
        name = row['column_name']
        if row['column_value'] == 'n/a':
            # Row describes a value column rather than a categorical level.
            hed_dict[name] = _get_value_entry(hed_entry, description, description_tag=description_tag)
        else:
            cat_dict = hed_dict.get(name, {})
            _update_cat_dict(cat_dict, row['column_value'], hed_entry, description,
                             description_tag=description_tag)
            hed_dict[name] = cat_dict
    return hed_dict

extract_tags

extract_tags(
    hed_string, search_tag
) -> tuple[str, list[str]]

Extract all instances of specified tag from a tag_string.

Parameters:

Name Type Description Default
hed_string str

Tag string from which to extract tag.

required
search_tag str

HED tag to extract.

required

Returns:

Type Description
tuple[str, list[str]]

tuple[str, list[str]]: The tag string without the extracted tags, and a list of the tags that were extracted, for example descriptions.

Source code in hed/tools/analysis/annotation_util.py
def extract_tags(hed_string, search_tag) -> tuple[str, list[str]]:
    """ Extract all instances of specified tag from a tag_string.

        Parameters:
           hed_string (str):   Tag string from which to extract tag.
           search_tag (str):   HED tag to extract.

        Returns:
            tuple[str, list[str]]:
                - Tag string without the tags.
                - A list of the tags that were extracted, for example descriptions.

    """
    # Strip parentheses so each comma-separated piece can be examined individually.
    pieces = hed_string.replace(")", "").replace("(", "").split(",")
    matched = [piece.strip() for piece in pieces if search_tag in piece]
    remaining = hed_string
    for tag in matched:
        remaining = df_util.replace_ref(remaining, tag)
    return remaining, matched

generate_sidecar_entry

generate_sidecar_entry(
    column_name, column_values=None
) -> dict

Create a sidecar column dictionary for column.

Parameters:

Name Type Description Default
column_name str

Name of the column.

required
column_values list

List of column values.

None

Returns: dict: A dictionary representing a template for a sidecar entry.

Source code in hed/tools/analysis/annotation_util.py
def generate_sidecar_entry(column_name, column_values=None) -> dict:
    """ Create a sidecar column dictionary for column.

    Parameters:
        column_name (str):       Name of the column.
        column_values (list):    List of column values.

    Returns:
        dict:   A dictionary representing a template for a sidecar entry.

    """

    # Collapse runs of characters outside [A-Za-z0-9-] into single underscores for labels.
    safe_name = re.sub(r'[^A-Za-z0-9-]+', '_', column_name)
    entry = {"Description": f"Description for {column_name}", "HED": ""}
    if column_values:
        levels = {}
        hed = {}
        for value in column_values:
            if value == "n/a":
                continue
            safe_value = re.sub(r'[^A-Za-z0-9-]+', '_', value)
            levels[value] = f"Here describe column value {value} of column {column_name}"
            hed[value] = f"(Label/{safe_name}, ID/{safe_value})"
        entry["Levels"] = levels
        entry["HED"] = hed
    else:
        entry["HED"] = f"(Label/{safe_name}, ID/#)"
    return entry

hed_to_df

hed_to_df(sidecar_dict, col_names=None) -> DataFrame

Return a 4-column dataframe of HED portions of sidecar.

Parameters:

Name Type Description Default
sidecar_dict dict

A dictionary conforming to BIDS JSON events sidecar format.

required
col_names (list, None)

A list of the cols to include in the flattened sidecar.

None

Returns:

Name Type Description
DataFrame DataFrame

Four-column spreadsheet representing HED portion of sidecar.

Notes
  • The returned DataFrame has columns: column_name, column_value, description, and HED.
Source code in hed/tools/analysis/annotation_util.py
def hed_to_df(sidecar_dict, col_names=None) -> DataFrame:
    """ Return a 4-column dataframe of HED portions of sidecar.

    Parameters:
        sidecar_dict (dict):      A dictionary conforming to BIDS JSON events sidecar format.
        col_names (list, None):   A list of the cols to include in the flattened sidecar.

    Returns:
        DataFrame:  Four-column spreadsheet representing HED portion of sidecar.

    Notes:
        - The returned DataFrame has columns: column_name, column_value, description, and HED.

    """

    if not col_names:
        col_names = sidecar_dict.keys()
    names = []
    values = []
    descriptions = []
    hed_tags = []

    for col_key, col_dict in sidecar_dict.items():
        # Only dictionary entries that carry HED information are flattened.
        if col_key not in col_names or not isinstance(col_dict, dict) or 'HED' not in col_dict:
            continue
        if 'Levels' in col_dict or isinstance(col_dict['HED'], dict):
            keys, vals, descs, tags = _flatten_cat_col(col_key, col_dict)
        else:
            keys, vals, descs, tags = _flatten_val_col(col_key, col_dict)
        names.extend(keys)
        values.extend(vals)
        descriptions.extend(descs)
        hed_tags.extend(tags)

    data = {"column_name": names, "column_value": values,
            "description": descriptions, "HED": hed_tags}
    return pd.DataFrame(data).astype(str)

merge_hed_dict

merge_hed_dict(sidecar_dict, hed_dict)

Update a JSON sidecar based on the hed_dict values.

Parameters:

Name Type Description Default
sidecar_dict dict

Dictionary representation of a BIDS JSON sidecar.

required
hed_dict dict

Dictionary derived from a dataframe representation of HED in sidecar.

required
Source code in hed/tools/analysis/annotation_util.py
def merge_hed_dict(sidecar_dict, hed_dict):
    """ Update a JSON sidecar based on the hed_dict values.

    Parameters:
        sidecar_dict (dict):  Dictionary representation of a BIDS JSON sidecar.
        hed_dict (dict):       Dictionary derived from a dataframe representation of HED in sidecar.

    """

    for key, entry in hed_dict.items():
        if key not in sidecar_dict:
            # Column not yet in the sidecar: copy the whole entry over unchanged.
            sidecar_dict[key] = entry
            continue
        hed_value = entry['HED']
        sidecar_dict[key]['HED'] = hed_value
        if isinstance(hed_value, str):
            # Value column: carry over the description only when one is actually given.
            if entry.get('Description', "n/a") != "n/a":
                sidecar_dict[key]['Description'] = entry['Description']
        elif isinstance(hed_value, dict) and 'Levels' in entry:
            # Categorical column: merge the level descriptions as well.
            sidecar_dict[key]['Levels'] = entry['Levels']

series_to_factor

series_to_factor(series) -> list[int]

Convert a series to an integer factor list.

Parameters:

Name Type Description Default
series Series

Series to be converted to a list.

required

Returns:

Type Description
list[int]

list[int] - contains 0's and 1's; empty values, 'n/a', and np.nan are converted to 0.

Source code in hed/tools/analysis/annotation_util.py
def series_to_factor(series) -> list[int]:
    """Convert a series to an integer factor list.

    Parameters:
        series (pd.Series): Series to be converted to a list.

    Returns:
        list[int]: Contains 0's and 1's; empty values, 'n/a', and np.nan are converted to 0.
    """
    # Map 'n/a' and missing values to False before the boolean cast, then encode as 0/1.
    cleaned = series.replace('n/a', False).fillna(False)
    return [1 if flag else 0 for flag in cleaned.astype(bool)]

str_to_tabular

str_to_tabular(tsv_str, sidecar=None) -> TabularInput

Return a TabularInput from a tsv string.

Parameters:

Name Type Description Default
tsv_str str

A string representing a tabular input.

required
sidecar (Sidecar, str, File or File - like)

An optional Sidecar object.

None

Returns: TabularInput: Represents a tabular input object.

Source code in hed/tools/analysis/annotation_util.py
def str_to_tabular(tsv_str, sidecar=None) -> TabularInput:
    """ Return a TabularInput from a tsv string.

    Parameters:
        tsv_str (str):  A string representing a tabular input.
        sidecar (Sidecar, str, File or File-like): An optional Sidecar object.

     Returns:
         TabularInput:  Represents a tabular input object.
     """

    # Wrap the raw string in an in-memory text buffer so it can be read like a file.
    buffer = io.StringIO(tsv_str)
    return TabularInput(file=buffer, sidecar=sidecar)

strs_to_hed_objs

strs_to_hed_objs(
    hed_strings, hed_schema
) -> Union[list[HedString], None]

Returns a list of HedString objects from a list of strings.

Parameters:

Name Type Description Default
hed_strings string or list

String or strings representing HED annotations.

required
hed_schema HedSchema or HedSchemaGroup

Schema version for the strings.

required

Returns:

Type Description
Union[list[HedString], None]

Union[list[HedString], None]: A list of HedString objects or None.

Source code in hed/tools/analysis/annotation_util.py
def strs_to_hed_objs(hed_strings, hed_schema) -> Union[list[HedString], None]:
    """ Returns a list of HedString objects from a list of strings.

     Parameters:
         hed_strings (string or list):  String or strings representing HED annotations.
         hed_schema (HedSchema or HedSchemaGroup): Schema version for the strings.

     Returns:
         Union[list[HedString], None]:  A list of HedString objects or None if no input was given.

     """
    if not hed_strings:
        # Covers None, the empty string, and the empty list.
        return None
    if not isinstance(hed_strings, list):
        hed_strings = [hed_strings]
    # hed_strings is guaranteed non-empty here, so the former trailing
    # `if hed_strings: ... else: return None` branch was dead code.
    return [HedString(hed, hed_schema=hed_schema) for hed in hed_strings]

strs_to_sidecar

strs_to_sidecar(sidecar_strings) -> Union[Sidecar, None]

Return a Sidecar from a sidecar as string or as a list of sidecars as strings.

Parameters:

Name Type Description Default
sidecar_strings string or list

String or strings representing sidecars.

required

Returns:

Type Description
Union[Sidecar, None]

Union[Sidecar, None]: the merged sidecar from the list.

Source code in hed/tools/analysis/annotation_util.py
def strs_to_sidecar(sidecar_strings) -> Union[Sidecar, None]:
    """ Return a Sidecar from a sidecar as string or as a list of sidecars as strings.

     Parameters:
         sidecar_strings (string or list):  String or strings representing sidecars.

     Returns:
         Union[Sidecar, None]:  the merged sidecar from the list, or None if no input was given.
     """

    if not sidecar_strings:
        # Covers None, the empty string, and the empty list.
        return None
    if not isinstance(sidecar_strings, list):
        sidecar_strings = [sidecar_strings]
    # sidecar_strings is guaranteed non-empty here, so the former trailing
    # `if sidecar_strings: ... else: return None` branch was dead code.
    file_list = [io.StringIO(s_string) for s_string in sidecar_strings]
    return Sidecar(files=file_list, name="Merged_Sidecar")

to_factor

to_factor(data, column=None) -> list[int]

Convert data to an integer factor list.

Parameters:

Name Type Description Default
data Series or DataFrame

Series or DataFrame to be converted to a list.

required
column str

Column name if DataFrame, otherwise column 0 is used.

None

Returns:

Type Description
list[int]

list[int]: A list containing 0's and 1's. Empty, 'n/a', and np.nan values are converted to 0.

Source code in hed/tools/analysis/annotation_util.py
def to_factor(data, column=None) -> list[int]:
    """Convert data to an integer factor list.

    Parameters:
        data (Series or DataFrame): Series or DataFrame to be converted to a list.
        column (str, optional): Column name if DataFrame, otherwise column 0 is used.

    Returns:
        list[int]: A list containing 0's and 1's. Empty, 'n/a', and np.nan values are converted to 0.

    """
    # Pick out the series to convert based on the input type.
    if isinstance(data, Series):
        series = data
    elif isinstance(data, DataFrame):
        series = data[column] if column else data.iloc[:, 0]
    else:
        raise HedFileError("CannotConvertToFactor",
                           f"Expecting Series or DataFrame but got {type(data)}", "")

    # Map 'n/a' and missing values to False before the boolean cast, then encode as 0/1.
    cleaned = series.replace('n/a', False).fillna(False)
    return [1 if flag else 0 for flag in cleaned.astype(bool)]

to_strlist

to_strlist(obj_list) -> list[str]

Convert objects in a list to strings, preserving None values.

Parameters:

Name Type Description Default
obj_list list

A list of objects that are None or have a str method.

required

Returns:

Type Description
list[str]

list[str]: A list with the objects converted to strings. None values are converted to empty strings.

Source code in hed/tools/analysis/annotation_util.py
def to_strlist(obj_list) -> list[str]:
    """Convert objects in a list to strings, preserving None values.

    Parameters:
        obj_list (list): A list of objects that are None or have a str method.

    Returns:
        list[str]: A list with the objects converted to strings. None values become empty strings.

    """
    # None has no meaningful str() form here, so it maps to the empty string.
    return ['' if item is None else str(item) for item in obj_list]

Remodeling Operations

Base Operations

base_op

Base class for remodeling operations.

BaseOp

Bases: ABC

Base class for operations. All remodeling operations should extend this class.

Source code in hed/tools/remodeling/operations/base_op.py
class BaseOp(ABC):
    """ Base class for operations. All remodeling operations should extend this class."""

    def __init__(self, parameters):
        """ Constructor for the BaseOp class. Should be extended by operations.

        Parameters:
            parameters (dict): A dictionary specifying the appropriate parameters for the operation.
        """
        # Raw parameter dictionary; subclasses extract and validate individual entries.
        self.parameters = parameters

    @property
    @abstractmethod
    def NAME(self):
        # Subclasses must supply the operation's name (e.g. "remove_columns").
        pass

    @property
    @abstractmethod
    def PARAMS(self):
        # Subclasses must supply a JSON-schema dictionary describing the valid parameters.
        pass

    @abstractmethod
    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Base class method to be overridden by each operation.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The tabular file to be remodeled.
            name (str): Unique identifier for the data -- often the original file path.
            sidecar (Sidecar or file-like):  A JSON sidecar needed for HED operations.

        Returns:
            DataFrame: A copy of df; overriding implementations return the remodeled frame.
        """

        return df.copy()

    @staticmethod
    @abstractmethod
    def validate_input_data(parameters):
        """ Validates whether operation parameters meet op-specific criteria beyond that captured in json schema.

        Example: A check to see whether two input arrays are the same length.

        Notes: The minimum implementation should return an empty list to indicate no errors were found.
               If additional validation is necessary, method should perform the validation and
               return a list with user-friendly error strings.
        """
        return []

do_op abstractmethod

do_op(dispatcher, df, name, sidecar=None)

Base class method to be overridden by each operation.

Parameters:

Name Type Description Default
dispatcher Dispatcher

Manages the operation I/O.

required
df DataFrame

The tabular file to be remodeled.

required
name str

Unique identifier for the data -- often the original file path.

required
sidecar Sidecar or file - like

A JSON sidecar needed for HED operations.

None
Source code in hed/tools/remodeling/operations/base_op.py
@abstractmethod
def do_op(self, dispatcher, df, name, sidecar=None):
    """ Base class method to be overridden by each operation.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O.
        df (DataFrame): The tabular file to be remodeled.
        name (str): Unique identifier for the data -- often the original file path.
        sidecar (Sidecar or file-like):  A JSON sidecar needed for HED operations.

    Returns:
        DataFrame: A copy of df; overriding implementations return the remodeled frame.
    """

    return df.copy()

validate_input_data abstractmethod staticmethod

validate_input_data(parameters)

Validates whether operation parameters meet op-specific criteria beyond that captured in json schema.

Example: A check to see whether two input arrays are the same length.

The minimum implementation should return an empty list to indicate no errors were found.

If additional validation is necessary, method should perform the validation and return a list with user-friendly error strings.

Source code in hed/tools/remodeling/operations/base_op.py
@staticmethod
@abstractmethod
def validate_input_data(parameters):
    """ Validates whether operation parameters meet op-specific criteria beyond that captured in json schema.

    Example: A check to see whether two input arrays are the same length.

    Returns:
        list: Empty if no errors; otherwise user-friendly error strings.

    Notes: The minimum implementation should return an empty list to indicate no errors were found.
           If additional validation is necessary, method should perform the validation and
           return a list with user-friendly error strings.
    """
    return []

Remove Columns

remove_columns_op

Remove columns from a columnar file.

RemoveColumnsOp

Bases: BaseOp

Remove columns from a columnar file.

Required remodeling parameters
  • column_names (list): The names of the columns to be removed.
  • ignore_missing (boolean): If True, names in column_names that are not columns in df should be ignored.
Source code in hed/tools/remodeling/operations/remove_columns_op.py
class RemoveColumnsOp(BaseOp):
    """ Remove columns from a columnar file.

    Required remodeling parameters:
        - **column_names** (*list*): The names of the columns to be removed.
        - **ignore_missing** (*boolean*): If True, names in column_names that are not columns in df should be ignored.

    """
    NAME = "remove_columns"

    PARAMS = {
        "type": "object",
        "properties": {
            "column_names": {
                "type": "array",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True
            },
            "ignore_missing": {"type": "boolean"}
        },
        "required": ["column_names", "ignore_missing"],
        "additionalProperties": False
    }

    def __init__(self, parameters):
        """ Constructor for remove columns operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.column_names = parameters['column_names']
        # Translate the boolean flag into the errors= mode understood by DataFrame.drop.
        self.error_handling = 'ignore' if parameters['ignore_missing'] else 'raise'

    def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
        """ Remove indicated columns from a dataframe.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like):  Not needed for this operation.

        Returns:
            pd.DataFrame: A new dataframe after processing.

        :raises KeyError:
            - If ignore_missing is False and a column not in the data is to be removed.

        """
        df_new = df.copy()
        try:
            return df_new.drop(self.column_names, axis=1, errors=self.error_handling)
        except KeyError:
            # Re-raise with a message identifying the file and the offending column list.
            raise KeyError("MissingColumnCannotBeRemoved",
                           f"{name}: Ignore missing is False but a column in {str(self.column_names)} is "
                           f"not in the data columns [{str(df_new.columns)}]")

    @staticmethod
    def validate_input_data(parameters):
        """ Additional validation required of operation parameters not performed by JSON schema validator. """
        return []

do_op

do_op(dispatcher, df, name, sidecar=None) -> 'pd.DataFrame'

Remove indicated columns from a dataframe.

Parameters:

Name Type Description Default
dispatcher Dispatcher

Manages the operation I/O.

required
df DataFrame

The DataFrame to be remodeled.

required
name str

Unique identifier for the dataframe -- often the original file path.

required
sidecar Sidecar or file - like

Not needed for this operation.

None

Returns:

Type Description
'pd.DataFrame'

pd.DataFrame: A new dataframe after processing.

:raises KeyError: - If ignore_missing is False and a column not in the data is to be removed.

Source code in hed/tools/remodeling/operations/remove_columns_op.py
def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
    """ Remove indicated columns from a dataframe.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O.
        df (DataFrame): The DataFrame to be remodeled.
        name (str): Unique identifier for the dataframe -- often the original file path.
        sidecar (Sidecar or file-like):  Not needed for this operation.

    Returns:
        pd.DataFrame: A new dataframe after processing.

    :raises KeyError:
        - If ignore_missing is False and a column not in the data is to be removed.

    """
    df_new = df.copy()
    try:
        # self.error_handling is 'ignore' or 'raise', derived from ignore_missing in the constructor.
        return df_new.drop(self.column_names, axis=1, errors=self.error_handling)
    except KeyError:
        # Re-raise with a message identifying the file and the offending column list.
        raise KeyError("MissingColumnCannotBeRemoved",
                       f"{name}: Ignore missing is False but a column in {str(self.column_names)} is "
                       f"not in the data columns [{str(df_new.columns)}]")

validate_input_data staticmethod

validate_input_data(parameters)

Additional validation required of operation parameters not performed by JSON schema validator.

Source code in hed/tools/remodeling/operations/remove_columns_op.py
@staticmethod
def validate_input_data(parameters):
    """ Additional validation required of operation parameters not performed by JSON schema validator.

    Returns:
        list: An empty list, indicating no additional errors.
    """
    return []

Rename Columns

rename_columns_op

Rename columns in a columnar file.

RenameColumnsOp

Bases: BaseOp

Rename columns in a tabular file.

Required remodeling parameters
  • column_mapping (dict): The names of the columns to be renamed with values to be remapped to.
  • ignore_missing (bool): If true, the names in column_mapping that are not columns should be ignored.
Source code in hed/tools/remodeling/operations/rename_columns_op.py
class RenameColumnsOp(BaseOp):
    """ Rename columns in a tabular file.

    Required remodeling parameters:
        - **column_mapping** (*dict*): The names of the columns to be renamed with values to be remapped to.
        - **ignore_missing** (*bool*): If true, the names in column_mapping that are not columns should be ignored.

    """
    NAME = "rename_columns"

    PARAMS = {
        "type": "object",
        "properties": {
            "column_mapping": {
                "type": "object",
                "description": "Mapping between original column names and their respective new names.",
                "patternProperties": {
                    ".*": {
                        "type": "string"
                    }
                },
                "minProperties": 1
            },
            "ignore_missing": {
                "type": "boolean",
                "description": "If true ignore column_mapping keys that don't correspond to columns, otherwise error."
            }
        },
        "required": [
            "column_mapping",
            "ignore_missing"
        ],
        "additionalProperties": False
    }

    def __init__(self, parameters):
        """ Constructor for rename columns operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.column_mapping = parameters['column_mapping']
        # Translate the boolean flag into the errors= mode understood by DataFrame.rename.
        if parameters['ignore_missing']:
            self.error_handling = 'ignore'
        else:
            self.error_handling = 'raise'

    def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
        """ Rename columns as specified in column_mapping dictionary.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like):  Not needed for this operation.

        Returns:
            pd.DataFrame: A new dataframe after processing.

        :raises KeyError:
            - When ignore_missing is False and column_mapping has columns not in the data.

        """
        df_new = df.copy()
        try:
            return df_new.rename(columns=self.column_mapping, errors=self.error_handling)
        except KeyError:
            # Fixed: the closing bracket after the column list was missing from this message.
            raise KeyError("MappedColumnsMissingFromData",
                           f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]"
                           f" but df columns are [{str(df.columns)}]")

    @staticmethod
    def validate_input_data(parameters):
        """ Additional validation required of operation parameters not performed by JSON schema validator. """
        return []

do_op

do_op(dispatcher, df, name, sidecar=None) -> 'pd.DataFrame'

Rename columns as specified in column_mapping dictionary.

Parameters:

Name Type Description Default
dispatcher Dispatcher

Manages the operation I/O.

required
df DataFrame

The DataFrame to be remodeled.

required
name str

Unique identifier for the dataframe -- often the original file path.

required
sidecar Sidecar or file - like

Not needed for this operation.

None

Returns:

Type Description
'pd.DataFrame'

pd.Dataframe: A new dataframe after processing.

:raises KeyError: - When ignore_missing is False and column_mapping has columns not in the data.

Source code in hed/tools/remodeling/operations/rename_columns_op.py
def do_op(self, dispatcher, df, name, sidecar=None) -> 'pd.DataFrame':
    """ Rename columns as specified in column_mapping dictionary.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O.
        df (DataFrame): The DataFrame to be remodeled.
        name (str): Unique identifier for the dataframe -- often the original file path.
        sidecar (Sidecar or file-like):  Not needed for this operation.

    Returns:
        pd.DataFrame: A new dataframe after processing.

    :raises KeyError:
        - When ignore_missing is False and column_mapping has columns not in the data.

    """
    df_new = df.copy()
    try:
        return df_new.rename(columns=self.column_mapping, errors=self.error_handling)
    except KeyError:
        # Fixed: the closing bracket after the column list was missing from this message.
        raise KeyError("MappedColumnsMissingFromData",
                       f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]"
                       f" but df columns are [{str(df.columns)}]")

validate_input_data staticmethod

validate_input_data(parameters)

Additional validation required of operation parameters not performed by JSON schema validator.

Source code in hed/tools/remodeling/operations/rename_columns_op.py
@staticmethod
def validate_input_data(parameters):
    """ Additional validation required of operation parameters not performed by JSON schema validator.

    Returns:
        list: An empty list, indicating no additional errors.
    """
    return []

BIDS Tools

BIDS Dataset Processing

bids

Models for BIDS datasets and files.