Source code for hed.tools.bids.bids_file_group

""" A group of BIDS files with specified suffix name. """

import os
import pandas as pd

from hed.errors.error_reporter import ErrorHandler
from hed.validator.sidecar_validator import SidecarValidator
from hed.tools.analysis.tabular_summary import TabularSummary
from hed.tools.bids.bids_tabular_file import BidsTabularFile
from hed.tools.bids.bids_sidecar_file import BidsSidecarFile
from hed.tools.util import io_util


class BidsFileGroup:
    """ Container for BIDS files with a specified suffix.

    Attributes:
        suffix (str):  The file suffix specifying the class of file represented in this group (e.g., events).
        sidecar_dict (dict):  A dictionary of BidsSidecarFile objects associated with this suffix, keyed by real path.
        datafile_dict (dict):  A dictionary, keyed by real path, whose values are either BidsTabularFile or
            BidsTimeseriesFile objects.
        sidecar_dir_dict (dict):  Dictionary whose keys are directory paths and values are lists of sidecars
            in the corresponding directory.
        bad_files (dict):  Dictionary whose keys are paths of files that could not be processed and whose
            values are the reasons.
        has_hed (bool):  True if any file in this group has HED annotations.

    """

    def __init__(self, root_path, file_list, suffix="events"):
        """ Constructor for a BidsFileGroup.

        Parameters:
            root_path (str):  The root path of the BIDS dataset.
            file_list (list):  List of paths to the relevant tsv and json files.
            suffix (str):  Suffix indicating the type this group represents (e.g., events or channels).

        """
        self.suffix = suffix
        ext_dict = io_util.separate_by_ext(file_list)
        self.bad_files = {}
        self.sidecar_dict = {}
        self.sidecar_dir_dict = {}
        self.datafile_dict = {}
        self.has_hed = False
        self._make_sidecar_dict(ext_dict.get('.json', []))
        self._make_dir_dict(root_path)
        self._make_datafile_dict(root_path, ext_dict.get('.tsv', []))

    def summarize(self, value_cols=None, skip_cols=None):
        """ Return a TabularSummary of the group's tabular files.

        Parameters:
            value_cols (list):  Column names designated as value columns.
            skip_cols (list):  Column names designated as columns to skip.

        Returns:
            TabularSummary:  A summary of the number of values in the different columns of this tabular group.

        Notes:
            - The columns that are not value_cols or skip_cols are summarized by counting
              the number of times each unique value appears in that column.

        """
        info = TabularSummary(value_cols=value_cols, skip_cols=skip_cols)
        info.update(list(self.datafile_dict.keys()))
        return info

    def validate(self, hed_schema, extra_def_dicts=None, check_for_warnings=False):
        """ Validate the sidecars and datafiles and return a list of issues.

        Parameters:
            hed_schema (HedSchema):  Schema to apply to the validation.
            extra_def_dicts (DefinitionDict):  Extra definitions that come from outside.
            check_for_warnings (bool):  If True, include warnings in the check.

        Returns:
            list:  A list of validation issues found. Each issue is a dictionary.

        """
        error_handler = ErrorHandler(check_for_warnings)
        issues = []
        issues += self.validate_sidecars(hed_schema, extra_def_dicts=extra_def_dicts, error_handler=error_handler)
        issues += self.validate_datafiles(hed_schema, extra_def_dicts=extra_def_dicts, error_handler=error_handler)
        return issues

    def validate_sidecars(self, hed_schema, extra_def_dicts=None, error_handler=None):
        """ Validate the merged sidecars.

        Parameters:
            hed_schema (HedSchema):  HED schema for validation.
            extra_def_dicts (DefinitionDict):  Extra definitions.
            error_handler (ErrorHandler):  Error handler to use.

        Returns:
            list:  A list of validation issues found. Each issue is a dictionary.

        """
        if not error_handler:
            error_handler = ErrorHandler(False)
        issues = []
        validator = SidecarValidator(hed_schema)
        for sidecar in self.sidecar_dict.values():
            issues += validator.validate(sidecar.contents, extra_def_dicts=extra_def_dicts,
                                         name=sidecar.file_path, error_handler=error_handler)
        return issues

    def validate_datafiles(self, hed_schema, extra_def_dicts=None, error_handler=None):
        """ Validate the datafiles and return an error list.

        Parameters:
            hed_schema (HedSchema):  Schema to apply to the validation.
            extra_def_dicts (DefinitionDict):  Extra definitions that come from outside.
            error_handler (ErrorHandler):  Error handler to use.

        Returns:
            list:  A list of validation issues found. Each issue is a dictionary.

        Notes:
            This will clear the contents of the datafiles if they were not previously set.

        """
        if not error_handler:
            error_handler = ErrorHandler(False)
        issues = []
        for data_obj in self.datafile_dict.values():
            if not data_obj.has_hed:
                continue
            had_contents = data_obj.contents
            data_obj.set_contents(overwrite=False)
            issues += data_obj.contents.validate(hed_schema, extra_def_dicts=extra_def_dicts,
                                                 name=data_obj.file_path, error_handler=error_handler)
            if not had_contents:
                data_obj.clear_contents()
        return issues

    def _make_dir_dict(self, root_path):
        """ Create a dictionary keyed by directory path and assign it to self.sidecar_dir_dict.

        Parameters:
            root_path (str):  The root path of the BIDS dataset.

        Note:
            Creates a dictionary with directories as keys and lists of the sidecars in each directory as values.

        """
        self.sidecar_dir_dict = {}
        for root, dirs, files in os.walk(root_path, topdown=True):
            sidecar_list = []
            for r_file in files:
                file_path = os.path.join(os.path.realpath(root), r_file)
                if file_path in self.sidecar_dict:
                    sidecar_list.append(file_path)
            if not sidecar_list:
                continue
            self.sidecar_dir_dict[os.path.realpath(root)] = sidecar_list

    def _make_datafile_dict(self, root_path, tsv_list):
        """ Set the dictionary of BidsTabularFile objects for the given list of tabular files.

        Parameters:
            root_path (str):  The root path of the BIDS dataset.
            tsv_list (list):  A list of paths to the tabular files.

        """
        self.datafile_dict = {}
        for file_path in tsv_list:
            tsv_obj = BidsTabularFile(file_path)
            if os.path.getsize(file_path) == 0:
                continue
            if tsv_obj.bad:
                self.bad_files[file_path] = f"{file_path} violates BIDS naming convention for {str(tsv_obj.bad)}"
                continue
            tsv_obj.set_sidecar(self._get_tsv_sidecar(root_path, tsv_obj))
            try:
                column_headers = list(pd.read_csv(file_path, sep='\t', nrows=0).columns)
            except Exception as e:
                self.bad_files[file_path] = f"{file_path} does not have a valid column header: {str(e)}"
                continue
            if "HED" in column_headers or "HED_assembled" in column_headers or tsv_obj.sidecar:
                self.has_hed = True
                tsv_obj.has_hed = True
            self.datafile_dict[os.path.realpath(file_path)] = tsv_obj

    def _get_tsv_sidecar(self, root_path, tsv_obj):
        """ Return the merged Sidecar for the tsv_obj.

        Parameters:
            root_path (str):  The root path of the BIDS dataset.
            tsv_obj (BidsTabularFile):  The BIDS tabular file to get the sidecars for.

        Returns:
            Union[Sidecar, None]:  The merged Sidecar for the tsv_obj, if any.

        """
        path_components = [root_path] + io_util.get_path_components(root_path, tsv_obj.file_path)
        sidecar_list = []
        current_path = ''
        # Walk from the dataset root down to the file's directory, collecting the applicable
        # sidecar (if any) at each level so that lower-level sidecars override higher-level ones.
        for comp in path_components:
            current_path = os.path.realpath(os.path.join(current_path, comp))
            candidate = self._get_sidecar_for_obj(tsv_obj, current_path)
            if candidate:
                sidecar_list.append(candidate)
        if len(sidecar_list) > 1:
            merged_name = "merged_" + io_util.get_basename(tsv_obj.file_path) + '.json'
            return BidsSidecarFile.merge_sidecar_list(sidecar_list, name=merged_name)
        elif len(sidecar_list) == 1:
            return sidecar_list[0].contents
        return None

    def _get_sidecar_for_obj(self, tsv_obj, current_path):
        """ Return a single BidsSidecarFile relevant to tsv_obj from the sidecars in the current path.

        Parameters:
            tsv_obj (BidsTabularFile):  A file whose sidecars are to be found.
            current_path (str):  The path of the directory whose sidecars are to be checked.

        Returns:
            Union[BidsSidecarFile, None]:  The BidsSidecarFile in current_path relevant to tsv_obj, if any.

        """
        sidecar_paths = self.sidecar_dir_dict.get(current_path, [])
        if not sidecar_paths:
            return None
        candidates = []
        for sidecar_path in sidecar_paths:
            sidecar = self.sidecar_dict[sidecar_path]
            if sidecar.is_sidecar_for(tsv_obj):
                candidates.append(sidecar)
        if len(candidates) == 1:
            return candidates[0]
        elif len(candidates) > 1:
            for candidate in candidates:
                self.bad_files[candidate.file_path] = \
                    f"Sidecar {str(candidate.file_path)} conflicts with other sidecars " \
                    f"for {tsv_obj.file_path} in {current_path}"
            return None
        return None

    def _make_sidecar_dict(self, json_files):
        """ Create a dictionary of BidsSidecarFile objects for the specified suffix type and set their contents.

        Parameters:
            json_files (list):  A list of paths to the json files.

        Notes:
            Sets self.sidecar_dict: a dictionary of BidsSidecarFile objects keyed by real path
            for the specified suffix type.

        """
        self.sidecar_dict = {}
        for file_path in json_files:
            if os.path.getsize(file_path) == 0:
                continue
            sidecar_file = BidsSidecarFile(os.path.realpath(file_path))
            if sidecar_file.bad:
                self.bad_files[file_path] = f"{file_path} violates BIDS naming convention for {str(sidecar_file.bad)}"
                continue
            sidecar_file.set_contents(overwrite=False)
            if sidecar_file.has_hed:
                self.sidecar_dict[os.path.realpath(file_path)] = sidecar_file
                self.has_hed = True

    @staticmethod
    def create_file_group(root_path, file_list, suffix):
        """ Factory method that returns a BidsFileGroup for the given suffix, or None if the group is empty.

        Parameters:
            root_path (str):  The root path of the BIDS dataset.
            file_list (list):  List of paths to the relevant tsv and json files.
            suffix (str):  Suffix indicating the type this group represents (e.g., events).

        Returns:
            Union[BidsFileGroup, None]:  The constructed group, or None if it has no sidecars or datafiles.

        """
        file_group = BidsFileGroup(root_path, file_list, suffix=suffix)
        if not file_group.sidecar_dict and not file_group.datafile_dict:
            return None
        return file_group
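

# Illustrative usage (a minimal sketch, not part of the module): shows how an events file
# group might be created, validated, and summarized. The dataset root, the schema version,
# and the skipped column names below are assumptions for the example; io_util.get_file_list
# is used only to assemble the candidate file list.
if __name__ == "__main__":
    from hed.schema import load_schema_version

    root_path = "/data/bids_dataset"  # hypothetical BIDS dataset root
    file_list = io_util.get_file_list(root_path, name_suffix="events", extensions=[".tsv", ".json"])
    file_group = BidsFileGroup.create_file_group(root_path, file_list, "events")
    if file_group:
        hed_schema = load_schema_version("8.3.0")  # assumed HED schema version
        issues = file_group.validate(hed_schema, check_for_warnings=True)
        print(f"{len(issues)} validation issue(s) found for the events files")
        summary = file_group.summarize(skip_cols=["onset", "duration", "sample"])
        print(summary.get_summary(as_json=True))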