"""Utilities for generating and handling file names."""
import os
import re
from datetime import datetime
# strftime pattern used by get_timestamp() to build timestamps that are safe in file names.
TIME_FORMAT = "%Y_%m_%d_T_%H_%M_%S_%f"
[docs]
def check_filename(test_file, name_prefix=None, name_suffix=None, extensions=None):
    """Test whether a file name has an accepted prefix, suffix, and extension.

    Parameters:
        test_file (str): Path of the file to test; only its final path component is examined.
        name_prefix (list, str, None): Accepted prefix(es) for the base file name.
        name_suffix (list, str, None): Accepted suffix(es) for the base file name with its extension removed.
        extensions (list, str, None): Accepted extension(s), matched against the end of the file name.

    Returns:
        bool: True if every criterion that was supplied is satisfied.

    Notes:
        - The file name is converted to lower case before testing.
        - A criterion that is None (or empty) is not checked.

    """
    stem = os.path.basename(test_file.lower())

    if name_prefix:
        if not get_allowed(stem, allowed_values=name_prefix, starts_with=True):
            return False

    if not extensions:
        # No extension filter: strip only the last extension before the suffix test.
        stem = os.path.splitext(stem)[0]
    else:
        matched_ext = get_allowed(stem, allowed_values=extensions, starts_with=False)
        if not matched_ext:
            return False
        # Remove exactly the matched extension (which may contain several periods).
        stem = stem[: -len(matched_ext)]

    if name_suffix:
        return bool(get_allowed(stem, allowed_values=name_suffix, starts_with=False))
    return True
[docs]
def get_allowed(value, allowed_values=None, starts_with=True):
    """Return the first allowed value that the given value starts (or ends) with.

    Parameters:
        value (str): Value to be matched.
        allowed_values (list, tuple, set, str, or None): Values to match. A single string counts as one value.
        starts_with (bool): If True the match is done at the beginning of value, otherwise at the end.

    Returns:
        Union[str, list]: value itself if allowed_values is None or empty; otherwise the first
            matching allowed value (in lower case), or an empty list if nothing matches.

    Notes:
        - The match is done in lower case.
        - The no-match result is falsy, so callers can test the return value directly.

    """
    if not allowed_values:
        return value
    # Treat any standard collection as a group of candidates; anything else is a single candidate.
    if isinstance(allowed_values, (list, tuple, set, frozenset)):
        candidates = [item.lower() for item in allowed_values]
    else:
        candidates = [allowed_values.lower()]
    lower_value = value.lower()
    matcher = lower_value.startswith if starts_with else lower_value.endswith
    # The empty-list default preserves the historical (falsy) no-match result.
    return next((item for item in candidates if matcher(item)), [])
[docs]
def get_alphanumeric_path(pathname, replace_char="_"):
    """Collapse each run of non-alphanumeric characters in a string into a replacement string.

    Parameters:
        pathname (str): Any string, typically a path name.
        replace_char (str): Text substituted for each run of characters outside a-z, A-Z, and 0-9.

    Returns:
        str: A new string with the substitutions applied.

    """
    non_alnum_run = re.compile(r"[^a-zA-Z0-9]+")
    return non_alnum_run.sub(replace_char, pathname)
[docs]
def get_full_extension(filename):
    """Split a file name into its stem and its complete (possibly multi-part) extension.

    Parameters:
        filename (str): The file name or path to be parsed.

    Returns:
        Tuple[str, str]:
            - The name with every trailing extension removed.
            - The full extension including periods (e.g. '.tsv.gz'), or an empty string if none.

    """
    stem, full_ext = os.path.splitext(filename)
    more_to_strip = bool(full_ext)
    while more_to_strip:
        # Peel one extension at a time; an empty piece means the stem has none left.
        stem, piece = os.path.splitext(stem)
        full_ext = piece + full_ext
        more_to_strip = bool(piece)
    return stem, full_ext
[docs]
def get_unique_suffixes(file_paths, extensions=None):
    """Get unique suffixes from file paths with specified extensions.

    The suffix of a file is the part of its base name (extension removed) that follows
    the last underscore, e.g. 'events' for 'sub-01_task-rest_events.tsv'.

    Parameters:
        file_paths (list): List of file paths to process.
        extensions (list or None): Full extensions to accept. If None, defaults to ['.json', '.tsv'].

    Returns:
        set: Set of unique suffixes found. Files whose base name has no underscore contribute nothing.

    Notes:
        - The extension comparison is exact (case-sensitive) against the full extension of each file.

    """
    if extensions is None:
        extensions = [".json", ".tsv"]
    extension_set = set(extensions)
    suffixes = set()
    for file_path in file_paths:
        name, ext = get_full_extension(file_path)
        if ext not in extension_set:
            continue
        # Take what follows the LAST underscore so names with several underscore-separated
        # parts (sub-01_task-rest_events) still yield their suffix.
        _, separator, suffix = os.path.basename(name).rpartition("_")
        if separator:
            suffixes.add(suffix)
    return suffixes
[docs]
def clean_filename(filename):
    """Replace each run of invalid characters in a file name with a single under-bar.

    Parameters:
        filename (str): Source file name.

    Returns:
        str: The file name in which every run of characters other than alphanumerics, periods,
            hyphens, and under-bars has been replaced by '_'. An empty or None input gives ''.

    """
    if filename:
        return re.sub(r"[^a-zA-Z0-9._-]+", "_", filename)
    return ""
[docs]
def get_basename(file_path):
    """Return the base filename (without directory or extension) for the given path.

    Parameters:
        file_path (str): Path to a file.

    Returns:
        str: The filename stem, e.g. ``sub-01_task-rest_events`` for ``/data/sub-01_task-rest_events.tsv``.
            Multi-part extensions such as ``.tsv.gz`` are removed in full.

    """
    # Drop the directory first so the result is only the file name stem, as documented.
    return get_full_extension(os.path.basename(file_path))[0]
[docs]
def get_filtered_by_element(file_list, elements):
    """Keep the file paths whose base name contains at least one of the given substrings.

    Parameters:
        file_list (list): List of file paths to be filtered.
        elements (list): Strings to look for in the base name of each path.

    Returns:
        list: The paths (in their original order) whose base name contains any member of elements.

    """

    def _has_element(path):
        # Only the final path component is searched; directory names never match.
        return any(token in os.path.basename(path) for token in elements)

    return list(filter(_has_element, file_list))
[docs]
def get_filtered_list(file_list, name_prefix=None, name_suffix=None, extensions=None):
    """Return the members of a file list whose names pass check_filename.

    Parameters:
        file_list (list): List of files to test.
        name_prefix (str): Optional name_prefix for the base filename.
        name_suffix (str): Optional name_suffix for the base filename.
        extensions (list): Optional list of file extensions (allows two periods (.tsv.gz)).

    Returns:
        list: The files that satisfy all of the supplied criteria, in their original order.

    """
    accepted = []
    for candidate in file_list:
        if check_filename(candidate, name_prefix=name_prefix, name_suffix=name_suffix, extensions=extensions):
            accepted.append(candidate)
    return accepted
[docs]
def get_file_list(root_path, name_prefix=None, name_suffix=None, extensions=None, exclude_dirs=None):
    """Walk a directory tree and collect the files whose names satisfy the given criteria.

    Parameters:
        root_path (str): Path of the directory tree to be traversed.
        name_prefix (list, str, None): An optional prefix for the base filename.
        name_suffix (list, str, None): An optional suffix for the base filename.
        extensions (list, None): A list of extensions to be selected.
        exclude_dirs (list, None): Directory names to skip during the traversal.

    Returns:
        list: The real (symlink-resolved) paths of the matching files.

    Notes:
        - A directory is skipped, together with everything below it, when its name appears in exclude_dirs.

    """
    skipped = exclude_dirs if exclude_dirs else []
    matches = []
    for current_dir, subdirs, filenames in os.walk(root_path, topdown=True):
        # Prune in place so os.walk does not descend into excluded directories.
        subdirs[:] = [subdir for subdir in subdirs if subdir not in skipped]
        matches.extend(
            os.path.realpath(os.path.join(current_dir, filename))
            for filename in filenames
            if check_filename(filename, name_prefix, name_suffix, extensions)
        )
    return matches
[docs]
def get_path_components(root_path, this_path):
    """List the directory components that lie between a root path and a descendant path.

    Parameters:
        root_path (str): A path (no trailing separator).
        this_path (str): The path of a file or directory expected to be a descendant of root_path.

    Returns:
        Union[list, None]: The directory names leading from root_path to the parent of this_path
            (empty if this_path is directly under, or equal to, root_path), or None if this_path
            is not located under root_path.

    Raises:
        ValueError: If the two resolved paths share no leading characters at all.

    Notes:
        - Both paths are resolved with realpath and normalized before they are compared.

    """
    base_path = os.path.normpath(os.path.realpath(root_path))
    cur_path = os.path.normpath(os.path.realpath(this_path))

    # Character-wise check first: nothing in common means the paths cannot be related.
    if not os.path.commonprefix([base_path, cur_path]):
        raise ValueError("NoPathInCommon", f"Paths {base_path} and {cur_path} must have items in common")

    # Component-wise check: the root itself must be the deepest shared directory.
    if os.path.commonpath([base_path, cur_path]) != base_path:
        return None

    parent_dir = os.path.dirname(os.path.relpath(cur_path, base_path))
    return os.path.normpath(parent_dir).split(os.sep) if parent_dir else []
[docs]
def get_timestamp():
    """Build a string from the current local time for use in file names.

    Returns:
        str: The current time formatted with TIME_FORMAT, minus its last three characters.

    """
    stamp = datetime.now().strftime(TIME_FORMAT)
    # Dropping three characters trims the trailing microsecond field to milliseconds.
    return stamp[:-3]
[docs]
def get_task_from_file(file_path):
    """Extract the value of the task entity from a BIDS-type file path.

    Parameters:
        file_path (str): File path.

    Returns:
        str: The text following the first 'task-' (matched case-insensitively) in the file name,
            up to the next under-bar or period; an empty string if 'task-' does not occur.

    """
    stem = os.path.splitext(os.path.basename(file_path))[0].strip()
    marker = "task-"
    start = stem.lower().find(marker)
    if start < 0:
        return ""
    remainder = stem[start + len(marker) :]
    return re.split(r"[_.]", remainder)[0]
[docs]
def get_task_dict(files):
    """Group file names by the task name found in each one.

    Parameters:
        files (list): List of filenames to be separated by task.

    Returns:
        dict: Task names mapped to the list of files for that task, in input order.
            Files for which no task name is found are omitted.

    """
    grouped = {}
    for my_file in files:
        task = get_task_from_file(my_file)
        if task:
            grouped.setdefault(task, []).append(my_file)
    return grouped
[docs]
def separate_by_ext(file_paths):
    """Group a list of file paths by their full extension.

    Parameters:
        file_paths (list): A list of file paths.

    Returns:
        dict: Each key is a full extension (e.g. '.tsv' or '.json') and its value is the list of
            paths with that extension, in input order.

    """
    by_extension = {}
    for file_path in file_paths:
        _, ext = get_full_extension(file_path)
        by_extension.setdefault(ext, []).append(file_path)
    return by_extension