# Source code for hed.schema.schema_io.base2schema

"""Abstract base class for loading HED schema files into HedSchema objects."""

import copy

from hed.schema.schema_io import schema_util
from hed.errors.exceptions import HedFileError, HedExceptions

from hed.schema.hed_schema import HedSchema
from hed.schema import hed_schema_constants as constants
from hed.schema.hed_schema_constants import HedKey
from abc import abstractmethod, ABC
from hed.schema import schema_header_util
from hed.schema import hed_schema_constants
from hed.schema.schema_io import df_constants


class SchemaLoader(ABC):
    """Baseclass for schema loading, to handle basic errors and partnered schemas.

    Expected usage is SchemaLoaderXML.load(filename).

    SchemaLoaderXML(filename) will load just the header_attributes.
    """

    def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""):
        """Open the input and read its header attributes; tags are not parsed yet.

        Parameters:
            filename(str or None): A valid filepath or None
            schema_as_string(str or None): A full schema as text or None
            schema(HedSchema or None): A HED schema to merge this new file into
                                       It must be a with-standard schema with the same value.
            file_format(str or None): The format of this file if needed(only for owl currently)
            name(str or None): Optional user supplied identifier, by default uses filename

        :raises HedFileError:
            - Both filename and schema_as_string are provided.
            - The input could not be opened (OSError, TypeError or ValueError from _open_file).
            - schema is provided but has no withStandard value.
            - schema is provided and its withStandard value differs from this file's.
        """
        if schema_as_string and filename:
            raise HedFileError(HedExceptions.BAD_PARAMETERS, "Invalid parameters to schema creation.", filename)

        self.file_format = file_format
        self.filename = filename
        self.name = name or filename
        self.schema_as_string = schema_as_string
        self.appending_to_schema = False

        try:
            self.input_data = self._open_file()
        except OSError as e:
            # strerror is None when the OSError was not built from an errno/strerror pair
            # (e.g. OSError("msg") or urllib's URLError), so fall back to the string form
            # rather than reporting an empty message.
            raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror or str(e), self.name) from e
        except (TypeError, ValueError) as e:
            raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(e), self.name) from e

        hed_attributes = self._get_header_attributes(self.input_data)
        schema_header_util.validate_attributes(hed_attributes, name=self.name)

        with_standard = hed_attributes.get(hed_schema_constants.WITH_STANDARD_ATTRIBUTE, "")
        self.library = hed_attributes.get(hed_schema_constants.LIBRARY_ATTRIBUTE, "")
        version_number = hed_attributes.get(hed_schema_constants.VERSION_ATTRIBUTE, "")

        if schema:
            # Merging this file into an existing schema: both must share the same withStandard value.
            self._schema = schema
            self.appending_to_schema = True
            if not self._schema.with_standard:
                raise HedFileError(
                    HedExceptions.SCHEMA_LOAD_FAILED,
                    "Loading multiple normal schemas as a merged one with the same namespace.  "
                    "Ensure schemas have the withStandard header attribute set",
                    self.name,
                )
            if with_standard != self._schema.with_standard:
                raise HedFileError(
                    HedExceptions.SCHEMA_LOAD_FAILED,
                    f"Merging schemas requires same withStandard value ({with_standard} != {self._schema.with_standard}).",
                    self.name,
                )
            # The merged header lists the existing version/library followed by this file's.
            hed_attributes[hed_schema_constants.VERSION_ATTRIBUTE] = self._schema.version_number + f",{version_number}"
            hed_attributes[hed_schema_constants.LIBRARY_ATTRIBUTE] = self._schema.library + f",{self.library}"
        else:
            self._schema = HedSchema()

        if name:
            self._schema.name = name
        self._schema.filename = filename
        self._schema.header_attributes = hed_attributes
        self._loading_merged = False
        self.fatal_errors = []

    @property
    def schema(self):
        """The partially loaded schema if you are after just header attributes."""
        return self._schema

    @classmethod
    def load(cls, filename=None, schema_as_string=None, schema=None, file_format=None, name=""):
        """Loads and returns the schema, including partnered schema if applicable.

        Parameters:
            filename(str or None): A valid filepath or None
            schema_as_string(str or None): A full schema as text or None
            schema(HedSchema or None): A HED schema to merge this new file into
                                       It must be a with-standard schema with the same value.
            file_format(str or None): If this is an owl file being loaded, this is the format.
                Allowed values include: turtle, json-ld, and owl(xml)
            name(str or None): Optional user supplied identifier, by default uses filename

        Returns:
            HedSchema: The new schema
        """
        loader = cls(filename, schema_as_string, schema, file_format, name)
        return loader._load()

    def _load(self):
        """Parses the previously loaded data, including loading a partnered schema if needed.

        Returns:
            schema(HedSchema): The new schema

        :raises HedFileError:
            - The withStandard schema named in the header could not be loaded.
        """
        self._loading_merged = True
        # Do a full load of the standard schema if this is a partnered schema
        needs_standard = not self.appending_to_schema and self._schema.with_standard and not self._schema.merged
        if needs_standard:
            # Imported here rather than at module level (presumably to avoid a circular import - confirm).
            from hed.schema.hed_schema_io import load_schema_version

            saved_attr = self._schema.header_attributes
            saved_format = self._schema.source_format
            try:
                base_version = load_schema_version(self._schema.with_standard)
            except HedFileError as e:
                raise HedFileError(
                    HedExceptions.SCHEMA_LIBRARY_INVALID,
                    message=f"Cannot load withStandard schema '{self._schema.with_standard}'",
                    filename=e.filename,
                ) from e

            # Copy the non-alterable cached schema
            self._schema = copy.deepcopy(base_version)
            self._schema.filename = self.filename
            # Manually set name here as we don't want to pass it to load_schema_version
            self._schema.name = self.name
            self._schema.header_attributes = saved_attr
            self._schema.source_format = saved_format
            self._loading_merged = False

        self._parse_data()
        self._schema.finalize_dictionaries()
        self.fix_extras()
        return self._schema

    @abstractmethod
    def _open_file(self):
        """Overloaded versions should retrieve the input from filename/schema_as_string"""
        pass

    @abstractmethod
    def _get_header_attributes(self, input_data):
        """Overloaded versions should return the header attributes from the input data."""
        pass

    @abstractmethod
    def _parse_data(self):
        """Puts the input data into the new schema"""
        pass

    def _add_to_dict_base(self, entry, key_class):
        """Add an entry to the schema, tagging it with this library when appropriate.

        Parameters:
            entry: The schema entry to add.
            key_class: The section key passed through to the schema's _add_tag_to_dict.

        Returns:
            The result of self._schema._add_tag_to_dict, or None when the entry is skipped
            (no InLibrary attribute while appending to an already merged schema).
        """
        if not entry.has_attribute(HedKey.InLibrary) and self.appending_to_schema and self._schema.merged:
            return None

        # Equivalent to the longer form "not with_standard or (not merged and with_standard)".
        if self.library and not (self._schema.with_standard and self._schema.merged):
            # only add it if not already present - This is a rare case
            if not entry.has_attribute(HedKey.InLibrary):
                entry._set_attribute_value(HedKey.InLibrary, self.library)

        return self._schema._add_tag_to_dict(entry.name, entry, key_class)

    @staticmethod
    def find_rooted_entry(tag_entry, schema, loading_merged):
        """This semi-validates rooted tags, raising an exception on major errors

        Parameters:
            tag_entry(HedTagEntry): the possibly rooted tag
            schema(HedSchema): The schema being loaded
            loading_merged(bool): If this schema was already merged before loading

        Returns:
            Union[HedTagEntry, None]: The base tag entry from the standard schema
                Returns None if this tag isn't rooted

        :raises HedFileError:
            - A rooted attribute is found in a non-paired schema
            - A rooted attribute is not a string
            - A rooted attribute was found on a non-root node in an unmerged schema.
            - A rooted attribute is found on a root node in a merged schema.
            - A rooted attribute indicates a tag that doesn't exist in the base schema.
        """
        rooted_tag = tag_entry.has_attribute(constants.HedKey.Rooted, return_value=True)
        if rooted_tag is None:
            return None

        if not schema.with_standard:
            raise HedFileError(
                HedExceptions.SCHEMA_LIBRARY_INVALID,
                f"Rooted tag attribute found on '{tag_entry.short_tag_name}' in a standard schema.",
                schema.name,
            )

        if not isinstance(rooted_tag, str):
            raise HedFileError(
                HedExceptions.SCHEMA_LIBRARY_INVALID,
                f"Rooted tag '{tag_entry.short_tag_name}' is not a string.",
                schema.name,
            )

        if tag_entry.parent_name and not loading_merged:
            raise HedFileError(
                HedExceptions.SCHEMA_LIBRARY_INVALID,
                f"Found rooted tag '{tag_entry.short_tag_name}' as a non root node.",
                schema.name,
            )

        if not tag_entry.parent_name and loading_merged:
            raise HedFileError(
                HedExceptions.SCHEMA_LIBRARY_INVALID,
                f"Found rooted tag '{tag_entry.short_tag_name}' as a root node in a merged schema.",
                schema.name,
            )

        rooted_entry = schema.tags.get(rooted_tag)
        if not rooted_entry or rooted_entry.has_attribute(constants.HedKey.InLibrary):
            raise HedFileError(
                HedExceptions.SCHEMA_LIBRARY_INVALID,
                f"Rooted tag '{tag_entry.short_tag_name}' not found in paired standard schema",
                schema.name,
            )

        # When loading an already merged schema the tag was validated above but no base entry is returned.
        if loading_merged:
            return None
        return rooted_entry

    def _add_fatal_error(
        self,
        line_number,
        line,
        warning_message="Schema term is empty or the line is malformed",
        error_code=HedExceptions.WIKI_DELIMITERS_INVALID,
    ):
        """Record a formatted error for the given line in self.fatal_errors.

        Parameters:
            line_number: The line number passed to schema_util.format_error.
            line: The offending line passed to schema_util.format_error.
            warning_message(str): The message describing the problem.
            error_code: The error code to report.
        """
        self.fatal_errors += schema_util.format_error(line_number, line, warning_message, error_code)

    def fix_extras(self):
        """Fixes the extras after loading the schema, to ensure they are in the correct format."""
        if not self._schema or not hasattr(self._schema, "extras") or not self._schema.extras:
            return

        # Only values of existing keys are replaced, so the mapping's size is unchanged while iterating.
        for key, extra in self._schema.extras.items():
            self._schema.extras[key] = extra.rename(columns=df_constants.EXTRAS_CONVERSIONS)
            if key in df_constants.extras_column_dict:
                self._schema.extras[key] = self.fix_extra(key)

    def fix_extra(self, key):
        """Normalize an extras dataframe by ensuring required columns are present and in canonical order.

        Missing required columns are added (as empty strings) to the stored dataframe
        before the reordered, sorted result is built.

        Parameters:
            key (str): The extras dict key identifying which extra dataframe to fix.

        Returns:
            pd.DataFrame: The normalized dataframe with required columns added and sorted.
        """
        df = self._schema.extras[key]
        priority_cols = df_constants.extras_column_dict[key]
        col_to_add = [col for col in priority_cols if col not in df.columns]
        if col_to_add:
            df[col_to_add] = ""

        # Required columns first (in their declared order), then any others alphabetically.
        other_cols = sorted(set(df.columns) - set(priority_cols))
        df = df[priority_cols + other_cols]
        return df.sort_values(by=list(df.columns))