"""Allows output of HedSchema objects as .tsv format"""
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.schema.schema_io.df_util import (
create_empty_dataframes,
get_library_name_and_id,
)
from hed.schema.schema_io.schema2base import Schema2Base
import pandas as pd
import hed.schema.schema_io.df_constants as constants
from hed.schema.hed_schema_entry import HedTagEntry
# Maps a schema section key to the dataframe key whose row list receives that
# section's entries (see Schema2DF._write_entry).
section_key_to_df = {
    HedSectionKey.Tags: constants.TAG_KEY,
    HedSectionKey.Units: constants.UNIT_KEY,
    HedSectionKey.UnitClasses: constants.UNIT_CLASS_KEY,
    HedSectionKey.UnitModifiers: constants.UNIT_MODIFIER_KEY,
    HedSectionKey.ValueClasses: constants.VALUE_CLASS_KEY,
    # Attributes and properties map to their own section key: _write_entry
    # treats these two values as markers and dispatches to dedicated writers.
    **{
        section_key: section_key
        for section_key in (HedSectionKey.Attributes, HedSectionKey.Properties)
    },
}
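# [docs]  (appears to be a Sphinx "view source" link marker left over from HTML extraction; not part of the module)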
class Schema2DF(Schema2Base):
    """Converts a HedSchema to a set of pandas DataFrames, one per schema section.

    While the schema is traversed, each entry is turned into a row dictionary
    and appended to ``self._suffix_rows`` under the key of the dataframe it
    belongs to.  When a section ends, the accumulated rows are converted to a
    string-typed DataFrame and stored in ``self.output``.
    """

    # Name of the base HED object that entries of each non-tag section derive from.
    _SECTION_BASE_OBJECTS = {
        HedSectionKey.Units: "HedUnit",
        HedSectionKey.UnitClasses: "HedUnitClass",
        HedSectionKey.UnitModifiers: "HedUnitModifier",
        HedSectionKey.ValueClasses: "HedValueClass",
    }

    def __init__(self):
        """Constructor for schema to dataframe converter"""
        super().__init__()
        self._suffix_rows = self._empty_suffix_rows()

    @staticmethod
    def _empty_suffix_rows():
        """Return a fresh row accumulator: one empty list per key in constants.DF_SUFFIXES."""
        return {suffix: [] for suffix in constants.DF_SUFFIXES}

    def _get_object_name_and_id(self, object_name, include_prefix=False):
        """Get the adjusted name and ID for the given object type.

        Parameters:
            object_name (str): The name of the base HED object, e.g. HedHeader, HedUnit.
            include_prefix (bool): If True, prepend "hed:" to the returned ID.

        Returns:
            tuple[str, str]: A tuple containing:
                - The inherited object name, e.g. StandardHeader (the library
                  prefix followed by object_name without its leading "Hed").
                - The full formatted hed_id.
        """
        prefix, obj_id = get_library_name_and_id(self._schema)
        name = f"{prefix}{object_name.removeprefix('Hed')}"
        full_hed_id = self._get_object_id(object_name, obj_id, include_prefix)
        return name, full_hed_id

    def _get_object_id(self, object_name, base_id=0, include_prefix=False):
        """Format the HED ID of a structural object.

        Parameters:
            object_name (str): Key into constants.struct_base_ids, e.g. HedHeader.
            base_id (int): Offset added to the object's base ID.
            include_prefix (bool): If True, prepend "hed:" to the ID.

        Returns:
            str: "HED_" followed by the summed ID zero-padded to 7 digits,
                optionally prefixed with "hed:".

        Raises:
            KeyError: If object_name is not in constants.struct_base_ids.
        """
        prefix = "hed:" if include_prefix else ""
        return f"{prefix}HED_{base_id + constants.struct_base_ids[object_name]:07d}"

    # =========================================
    # Required baseclass function
    # =========================================
    def _initialize_output(self):
        """Reset the output to empty dataframes and clear all accumulated rows."""
        self.output = create_empty_dataframes()
        self._suffix_rows = self._empty_suffix_rows()

    def _create_and_add_object_row(self, base_object, attributes="", description=""):
        """Append a row for a structural object to the STRUCT_KEY dataframe.

        Parameters:
            base_object (str): The base HED object name, e.g. HedHeader.
            attributes (str): The already formatted attribute string for the row.
            description (str or None): The description text; None is treated as
                empty.  Newlines are written as a literal backslash-n so the
                value stays on one line.
        """
        name, full_hed_id = self._get_object_name_and_id(base_object)
        new_row = {
            constants.hed_id: full_hed_id,
            constants.name: name,
            constants.attributes: attributes,
            constants.subclass_of: base_object,
            # Guard against a missing description rather than failing on None.replace.
            constants.dcdescription: (description or "").replace("\n", "\\n"),
        }
        struct_df = self.output[constants.STRUCT_KEY]
        # Assigning to .loc at the current length appends the row in place.
        struct_df.loc[len(struct_df)] = new_row

    def _output_header(self, attributes):
        """Add the HedHeader row, storing the header attributes as a ", "-separated string."""
        base_object = "HedHeader"
        attributes_string = self._get_attribs_string_from_schema(attributes, sep=", ")
        self._create_and_add_object_row(base_object, attributes_string)

    def _output_prologue(self, prologue):
        """Add the HedPrologue row with the prologue text as its description."""
        base_object = "HedPrologue"
        self._create_and_add_object_row(base_object, description=prologue)

    def _output_annotations(self, hed_schema):
        """No-op: annotations are handled in the extras section."""
        # This is taken care of in the extras section
        pass

    def _output_extras(self, hed_schema):
        """Copy every extras dataframe of the schema into the output.

        Parameters:
            hed_schema (HedSchema): The HED schema to extract the information from.
        """
        for key, df in hed_schema.extras.items():
            # Copy so later changes to the output do not modify the schema's dataframe.
            self.output[key] = df.copy()

    def _output_epilogue(self, epilogue):
        """Add the HedEpilogue row with the epilogue text as its description."""
        base_object = "HedEpilogue"
        self._create_and_add_object_row(base_object, description=epilogue)

    def _output_footer(self):
        """No-op: the dataframe output has no footer."""
        # This is not needed for the dataframe output
        pass

    def _start_section(self, key_class):
        """No-op: nothing has to be done when a section starts."""
        # This is not needed for the dataframe output
        pass

    def _flush_suffix_rows(self, suffix_key):
        """Store the rows accumulated for suffix_key as a string-typed DataFrame.

        Nothing is written when no rows were accumulated, so the dataframe
        already present in the output is kept.

        Parameters:
            suffix_key (str): The dataframe key whose rows should be flushed.
        """
        rows = self._suffix_rows.get(suffix_key)
        if rows:
            self.output[suffix_key] = pd.DataFrame(rows, dtype=str)

    def _end_tag_section(self):
        """Write the accumulated tag rows to the output."""
        self._flush_suffix_rows(constants.TAG_KEY)

    def _end_units_section(self):
        """Write the accumulated unit and unit class rows to the output."""
        self._flush_suffix_rows(constants.UNIT_KEY)
        self._flush_suffix_rows(constants.UNIT_CLASS_KEY)

    def _end_section(self, section_key):
        """Updates the output with the current values from the section

        Parameters:
            section_key (HedSectionKey): The section key to end.
        """
        # Sections without an entry in section_key_to_suffixes have nothing to flush.
        for suffix_key in constants.section_key_to_suffixes.get(section_key, []):
            self._flush_suffix_rows(suffix_key)

    def _write_tag_entry(self, tag_entry, parent_node=None, level=0):
        """Accumulate the dataframe row for a single tag entry.

        Parameters:
            tag_entry (HedTagEntry): The tag entry to write.
            parent_node: Unused by the dataframe output.
            level (int): The tag's level, stored as a string in the row.
        """
        tag_id = tag_entry.attributes.get(HedKey.HedID, "")
        row_name = tag_entry.short_tag_name
        if tag_entry.name.endswith("#"):
            # Entries whose name ends with "#" get a "-#" suffix on the short name.
            row_name += "-#"
        new_row = {
            constants.hed_id: f"{tag_id}",
            constants.level: f"{level}",
            constants.name: row_name,
            constants.subclass_of: self._get_subclass_of(tag_entry),
            constants.attributes: self._format_tag_attributes(tag_entry.attributes),
            constants.dcdescription: tag_entry.description,
        }
        self._suffix_rows[constants.TAG_KEY].append(new_row)

    def _write_entry(self, entry, parent_node, include_props=True):
        """Produce a dictionary for a single row for a non-tag HedSchemaEntry object.

        The row is appended to the row list of the dataframe matching the
        entry's section.  Entries of sections without a dataframe mapping are
        ignored.

        Parameters:
            entry (HedSchemaEntry): The HedSchemaEntry object to write.
            parent_node (str): The parent node of the entry (unused here).
            include_props (bool): Whether to include properties in the output.

        Returns:
            None
        """
        df_key = section_key_to_df.get(entry.section_key)
        if not df_key:
            return

        # Special case: properties and attributes have their own row layout.
        if df_key == HedSectionKey.Properties:
            return self._write_property_entry(entry)
        if df_key == HedSectionKey.Attributes:
            return self._write_attribute_entry(entry, include_props=include_props)

        tag_id = entry.attributes.get(HedKey.HedID, "")
        new_row = {
            constants.hed_id: f"{tag_id}",
            constants.name: entry.name,
            constants.subclass_of: self._get_subclass_of(entry),
            constants.attributes: self._format_tag_attributes(entry.attributes),
            constants.dcdescription: entry.description,
        }
        # Handle the special case of units, which have the extra unit class
        if hasattr(entry, "unit_class_entry"):
            new_row[constants.has_unit_class] = entry.unit_class_entry.name
        self._suffix_rows[df_key].append(new_row)

    def _write_attribute_entry(self, entry, include_props):
        """Accumulate the row for a schema attribute entry.

        The target dataframe and property type are chosen from the entry's
        attributes: an annotation property goes to ANNOTATION_KEY, an entry
        with a numeric, string or bool range goes to DATA_KEY, and everything
        else goes to OBJECT_KEY.

        Parameters:
            entry (HedSchemaEntry): The attribute entry to write.
            include_props (bool): Whether to fill the properties column;
                an empty string is written otherwise.
        """
        entry_attributes = entry.attributes
        if HedKey.AnnotationProperty in entry_attributes:
            df_key = constants.ANNOTATION_KEY
            property_type = "AnnotationProperty"
        elif (
            HedKey.NumericRange in entry_attributes
            or HedKey.StringRange in entry_attributes
            or HedKey.BoolRange in entry_attributes
        ):
            df_key = constants.DATA_KEY
            property_type = "DataProperty"
        else:
            df_key = constants.OBJECT_KEY
            property_type = "ObjectProperty"

        tag_id = entry_attributes.get(HedKey.HedID, "")
        new_row = {
            constants.hed_id: f"{tag_id}",
            constants.name: entry.name,
            constants.property_type: property_type,
            constants.properties: self._format_tag_attributes(entry_attributes) if include_props else "",
            constants.dcdescription: entry.description,
        }
        self._suffix_rows[df_key].append(new_row)

    def _write_property_entry(self, entry):
        """Accumulate the row for a schema property entry.

        The row is appended to the ATTRIBUTE_PROPERTY_KEY row list and always
        has the property type "AnnotationProperty".

        Parameters:
            entry (HedSchemaEntry): The property entry to write.
        """
        tag_id = entry.attributes.get(HedKey.HedID, "")
        new_row = {
            constants.hed_id: f"{tag_id}",
            constants.name: entry.name,
            constants.property_type: "AnnotationProperty",
            constants.dcdescription: entry.description,
        }
        self._suffix_rows[constants.ATTRIBUTE_PROPERTY_KEY].append(new_row)

    def _attribute_disallowed(self, attribute):
        """Return True if the attribute must be left out of the attribute strings.

        In addition to what the base class disallows, hedId and the annotation
        property marker are stripped, since the dataframe rows carry them in
        the hed_id column and the property type respectively.

        Parameters:
            attribute (str): The attribute name to check.

        Returns:
            bool: True if the attribute should not be written.
        """
        if super()._attribute_disallowed(attribute):
            return True
        # strip out hedID in dataframe format
        return attribute in (HedKey.HedID, HedKey.AnnotationProperty)

    def _get_subclass_of(self, tag_entry):
        """Get the value of the subclass-of column for an entry.

        Parameters:
            tag_entry (HedSchemaEntry): The entry to get the base object for.

        Returns:
            str: For a tag, the short name of its parent, or "HedTag" for a
                top-level tag.  For other entries, the adjusted name of the
                base object of the entry's section.

        Raises:
            KeyError: If a non-tag entry belongs to a section without a base object.
        """
        # Special case for HedTag
        if isinstance(tag_entry, HedTagEntry):
            return tag_entry.parent.short_tag_name if tag_entry.parent else "HedTag"

        base_object = self._SECTION_BASE_OBJECTS[tag_entry.section_key]
        name, _ = self._get_object_name_and_id(base_object, include_prefix=False)
        return name