""" Utilities for loading and outputting HED schema. """
from __future__ import annotations
import os
import json
import functools
from typing import Union
from hed.schema.hed_schema import HedSchema
from hed.schema.schema_io.xml2schema import SchemaLoaderXML
from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki
from hed.schema.schema_io.df2schema import SchemaLoaderDF
from hed.schema import hed_cache
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io import schema_util
from hed.schema.hed_schema_group import HedSchemaGroup
from hed.schema.schema_header_util import validate_version_string
from collections import defaultdict
from urllib.error import URLError
MAX_MEMORY_CACHE = 40
[docs]
def load_schema_version(xml_version=None, xml_folder=None) -> Union['HedSchema', 'HedSchemaGroup']:
    """ Load one or more official HED schemas identified by version.

    Parameters:
        xml_version (str or list): A version string or list of version strings.
            Basic format of each: `[schema_namespace:][library_name_]X.Y.Z`.
            A str that starts and ends with square brackets or with double quotes is
            decoded as JSON first (the format produced by HedSchema.get_formatted_version).
        xml_folder (str): Path to a folder containing schema.

    Returns:
        Union[HedSchema, HedSchemaGroup]: A single schema when only one namespace results,
            otherwise a group whose name is the comma-joined versions of its members.

    Raises:
        HedFileError: The xml_version looked like JSON but could not be decoded.
        HedFileError: The xml_version is not valid.
        HedFileError: The specified version cannot be found or loaded.
        HedFileError: Other fatal errors loading the schema (These are unlikely if you are not editing them locally).
        HedFileError: The prefix is invalid.
    """
    if xml_version and isinstance(xml_version, str):
        # Bracket- or quote-delimited text is treated as a JSON-encoded version spec.
        delimiters = (xml_version[0], xml_version[-1])
        if delimiters in [('[', ']'), ('"', '"')]:
            try:
                xml_version = json.loads(xml_version)
            except json.decoder.JSONDecodeError as e:
                raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), xml_version) from e

    # Anything that is not a non-empty list goes straight to the single-version loader.
    if not (xml_version and isinstance(xml_version, list)):
        return _load_schema_version(xml_version=xml_version, xml_folder=xml_folder)

    # One combined version string per namespace; each yields one loaded schema.
    versions_by_namespace = parse_version_list(xml_version)
    loaded = []
    for version_string in versions_by_namespace.values():
        loaded.append(_load_schema_version(xml_version=version_string, xml_folder=xml_folder))

    if len(loaded) == 1:
        return loaded[0]
    group_name = ",".join([member.version for member in loaded])
    return HedSchemaGroup(loaded, name=group_name)
[docs]
def load_schema(hed_path, schema_namespace=None, schema=None, name=None) -> 'HedSchema':
    """ Load a schema from the given file or URL path.

    Parameters:
        hed_path (str): A filepath or url to open a schema from.
            If loading a TSV file, this should be a single filename where:
            Template: basename.tsv, where files are named basename_Struct.tsv, basename_Tag.tsv, etc.
            Alternatively, you can point to a directory containing the .tsv files.
        schema_namespace (str or None): The name_prefix all tags in this schema will accept.
        schema (HedSchema or None): A HED schema to merge this new file into.
            It must be a with-standard schema with the same value.
            Not supported for TSV/directory loading.
        name (str or None): User supplied identifier for this schema.

    Returns:
        HedSchema: The loaded schema.

    Raises:
        HedFileError: Empty path passed.
        HedFileError: The URL could not be read.
        HedFileError: Unknown extension.
        HedFileError: A schema to merge into was passed while loading TSV files.
        HedFileError: Any fatal issues when loading the schema.
    """
    if not hed_path:
        raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file",
                           filename=hed_path)

    is_url = hed_cache._check_if_url(hed_path)
    lowered_path = hed_path.lower()
    if is_url:
        try:
            file_as_string = schema_util.url_to_string(hed_path)
        except URLError as e:
            raise HedFileError(HedExceptions.URL_ERROR, str(e), hed_path) from e
        # Forward `schema` so merging works for URLs exactly as it does for local
        # .xml/.mediawiki files; previously it was dropped here and the merge silently skipped.
        hed_schema = from_string(file_as_string, schema_format=os.path.splitext(lowered_path)[1],
                                 schema=schema, name=name)
    elif lowered_path.endswith(".xml"):
        hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
    elif lowered_path.endswith(".mediawiki"):
        hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
    elif lowered_path.endswith(".tsv") or os.path.isdir(hed_path):
        if schema is not None:
            raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
                               "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
        hed_schema = SchemaLoaderDF.load_spreadsheet(filenames=hed_path, name=name)
    else:
        raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=hed_path)

    if schema_namespace:
        hed_schema.set_schema_prefix(schema_namespace=schema_namespace)
    return hed_schema
[docs]
def from_string(schema_string, schema_format=".xml", schema_namespace=None, schema=None, name=None) -> 'HedSchema':
    """ Build a schema from text already held in memory.

    Parameters:
        schema_string (str): An XML or mediawiki file as a single long string.
        schema_format (str): The schema format of the source schema string.
            Allowed normal values: .mediawiki, .xml
        schema_namespace (str, None): The name_prefix all tags in this schema will accept.
        schema (HedSchema or None): A HED schema to merge this new file into.
            It must be a with-standard schema with the same value.
        name (str or None): User supplied identifier for this schema.

    Returns:
        HedSchema: The loaded schema.

    Raises:
        HedFileError: If empty string or invalid extension is passed.
        HedFileError: Other fatal formatting issues with file.

    Notes:
        - The loader is chosen from the suffix of schema_format.
    """
    if not schema_string:
        raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string",
                           filename=name)

    if isinstance(schema_string, str):
        # Normalize Windows line endings; the caller may not have done so.
        schema_string = schema_string.replace("\r\n", "\n")

    # Pick the loader first, then make a single load call.
    if schema_format.endswith(".xml"):
        loader = SchemaLoaderXML
    elif schema_format.endswith(".mediawiki"):
        loader = SchemaLoaderWiki
    else:
        raise HedFileError(HedExceptions.INVALID_EXTENSION, f"Unknown schema extension {schema_format}", filename=name)

    hed_schema = loader.load(schema_as_string=schema_string, schema=schema, name=name)
    if schema_namespace:
        hed_schema.set_schema_prefix(schema_namespace=schema_namespace)
    return hed_schema
[docs]
def from_dataframes(schema_data, schema_namespace=None, name=None) -> 'HedSchema':
    """ Build a schema from a dict of spreadsheet contents.

    Parameters:
        schema_data (dict of str or None): A dict of DF_SUFFIXES:file_as_string_or_df
            Should have an entry for all values of DF_SUFFIXES.
        schema_namespace (str, None): The name_prefix all tags in this schema will accept.
        name (str or None): User supplied identifier for this schema.

    Returns:
        HedSchema: The loaded schema.

    Raises:
        HedFileError: If empty/invalid parameters.
        Exception: Other fatal I/O or formatting issues.
    """
    # Reject anything that is not a non-empty dict before handing off to the loader.
    if not (schema_data and isinstance(schema_data, dict)):
        raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty or non dict value passed to HedSchema.from_dataframes",
                           filename=name)

    loaded_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings_or_df=schema_data, name=name)
    if schema_namespace:
        loaded_schema.set_schema_prefix(schema_namespace=schema_namespace)
    return loaded_schema
# If this is actually used, we could easily add other versions/update this one
[docs]
def get_hed_xml_version(xml_file_path) -> str:
    """ Read the version of a HED XML schema file.

    Parameters:
        xml_file_path (str): The path to a HED XML file.

    Returns:
        str: The version number of the HED XML file.

    Raises:
        HedFileError: There is an issue loading the schema.
    """
    # The version is read off the schema object held by the XML loader.
    return SchemaLoaderXML(xml_file_path).schema.version
[docs]
def parse_version_list(xml_version_list) -> dict:
"""Takes a list of xml versions and returns a dictionary split by prefix
e.g. ["score", "testlib"] will return {"": "score, testlib"}
e.g. ["score", "testlib", "ol:otherlib"] will return {"": "score, testlib", "ol:": "otherlib"}
Parameters:
xml_version_list (list): List of str specifying which HED schemas to use
Returns:
dict: A dictionary of version strings split by prefix.
"""
out_versions = defaultdict(list)
for version in xml_version_list:
schema_namespace = ""
if version and ":" in version:
schema_namespace, _, version = version.partition(":")
if not isinstance(version, str):
raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID,
f"Must specify schema version by number, found no version on {xml_version_list} schema.",
filename=None)
if version in out_versions[schema_namespace]:
raise HedFileError(HedExceptions.SCHEMA_DUPLICATE_LIBRARY,
f"Attempting to load same library '{version}' twice: {out_versions[schema_namespace]}",
filename=None)
out_versions[schema_namespace].append(version)
out_versions = {key: ",".join(value) if not key else f"{key}:" + ",".join(value) for key, value in
out_versions.items()}
return out_versions
@functools.lru_cache(maxsize=MAX_MEMORY_CACHE)
def _load_schema_version(xml_version=None, xml_folder=None):
    """ Load the schema(s) named by one version string, merging them into a single schema.

    Results are memoized by functools.lru_cache (up to MAX_MEMORY_CACHE entries), so both
    arguments must be hashable and repeated calls with equal arguments return the same object.

    Parameters:
        xml_version (str): HED version format string. Expected format: '[schema_namespace:][library_name_]X.Y.Z'
            Further versions can be added comma separated after the version number/library name.
            e.g. "lib:library_x.y.z,otherlibrary_x.y.z" loads "library" and "otherlibrary" into "lib:"
            The schema namespace must be the same and not repeated if loading multiple merged schemas.
        xml_folder (str): Path to a folder containing schema.

    Returns:
        HedSchema: The first schema loaded, with every later version merged into it.

    Raises:
        HedFileError: The xml_version is not valid.
        HedFileError: The specified version cannot be found or loaded.
        HedFileError: Multiple schemas are being loaded with the same prefix, and they have duplicate tags.
        HedFileError: Other fatal errors loading the schema (These are unlikely if you are not editing them locally).
        HedFileError: The prefix is invalid.
    """
    schema_namespace = ""
    # Keep the full original string (namespace included) as the schema's identifying name.
    name = xml_version
    if xml_version:
        if ":" in xml_version:
            # Everything before the first colon is the namespace; the rest is the version list.
            schema_namespace, _, xml_version = xml_version.partition(":")
    if xml_version:
        xml_versions = xml_version.split(",")
    else:
        # No version given: pass a single blank entry through. _load_schema_version_sub
        # replaces a blank version with its built-in default rather than raising.
        xml_versions = [""]
    first_schema = _load_schema_version_sub(xml_versions[0], schema_namespace, xml_folder=xml_folder,
                                            name=name)
    filenames = [os.path.basename(first_schema.filename)]
    # Collect all duplicate issues for proper error reporting
    all_duplicate_issues = []
    for version in xml_versions[1:]:
        # Return value is unused: the call is made for its effect of merging into first_schema.
        _load_schema_version_sub(version, schema_namespace, xml_folder=xml_folder, schema=first_schema,
                                 name=name)
        # Collect duplicate errors when merging schemas in the same namespace
        # NOTE(review): this reads first_schema.filename after the merge; presumably the loader
        # updates it to the file just merged - confirm against the loader.
        current_filename = os.path.basename(first_schema.filename)
        duplicate_name = first_schema.has_duplicates()
        if duplicate_name:
            # Collect all duplicate issues rather than raising immediately
            for section in first_schema._sections.values():
                # Not every section object necessarily tracks duplicates, hence the hasattr guard.
                if hasattr(section, 'duplicate_names') and section.duplicate_names:
                    for dup_name in section.duplicate_names.keys():
                        # The message lists the files gathered before this iteration's append below.
                        issue = {
                            'code': HedExceptions.SCHEMA_DUPLICATE_NAMES,
                            'message': f"Duplicate tag {dup_name} found when merging schemas: {filenames}",
                            'filename': name
                        }
                        all_duplicate_issues.append(issue)
        filenames.append(current_filename)
    # If we found duplicates, raise error with all issues
    if all_duplicate_issues:
        raise HedFileError(HedExceptions.SCHEMA_DUPLICATE_NAMES,
                           f"Found {len(all_duplicate_issues)} duplicate tags when merging schemas: {filenames}",
                           filename=name, issues=all_duplicate_issues)
    # Re-apply the namespace prefix to the final merged schema.
    if first_schema._namespace:
        first_schema.set_schema_prefix(first_schema._namespace)
    return first_schema
def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None, schema=None, name=""):
    """ Locate and load exactly one schema version, optionally merging it into another schema.

    Parameters:
        xml_version (str): HED version format string. Expected format: '[library_name_]X.Y.Z'
            A blank value is replaced by "8.3.0".
        schema_namespace (str): The prefix this will have.
        xml_folder (str): Path to a folder containing schema.
        schema (HedSchema or None): A HED schema to merge this new file into.
        name (str): User supplied identifier for this schema.

    Returns:
        HedSchema: The requested HedSchema object.

    Raises:
        HedFileError: For the following issues:
            - The xml_version is not valid.
            - The specified version cannot be found or loaded
            - Other fatal errors loading the schema (These are unlikely if you are not editing them locally)
            - The prefix is invalid
    """
    # A blank version falls back to the hard-coded default.
    xml_version = xml_version or "8.3.0"

    # An underscore separates an optional library name from the version number;
    # only the version number part is validated.
    library_name, version_number = "", xml_version
    if "_" in xml_version:
        library_name, _, version_number = xml_version.partition("_")

    problem = validate_version_string(version_number)
    if problem:
        raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID,
                           f"Invalid version format '{version_number}': {problem}", xml_version)

    schema_path = hed_cache.get_hed_version_path(version_number, library_name=library_name,
                                                 local_hed_directory=xml_folder)
    if not schema_path:
        # Nothing found: report what was looked for along with the versions that are known.
        library_string = f"for library '{library_name}'" if library_name else ""
        known_versions = hed_cache.get_hed_versions(xml_folder, library_name=library_name or "all")
        raise HedFileError(HedExceptions.FILE_NOT_FOUND,
                           f"HED version {library_string}: '{version_number}' not found. Check {hed_cache.get_cache_directory(xml_folder)} for cache or https://github.com/hed-standard/hed-schemas/tree/main/library_schemas. "
                           f"Known versions {library_string}: {known_versions}.", '')

    return load_schema(schema_path, schema_namespace=schema_namespace, schema=schema, name=name)