Source code for hed.models.schema_lookup

"""Schema lookup table for ancestor-aware string search.

Generates a compact mapping from every tag's casefolded short name to its
full ``tag_terms`` tuple (all slash-path components from root to self), which
is the same information stored in :attr:`~hed.schema.HedTagEntry.tag_terms` after
schema loading.

This lookup table is used by :func:`~hed.models.string_search.parse_hed_string`
and :class:`~hed.models.string_search.StringQueryHandler` to enable ancestor search
on short-form HED strings without requiring a full schema object at search time.

Typical workflow::

    from hed.schema import load_schema_version
    from hed.models.schema_lookup import generate_schema_lookup, save_schema_lookup

    schema = load_schema_version("8.4.0")
    lookup = generate_schema_lookup(schema)
    # Optionally persist:
    save_schema_lookup(lookup, "hed_8.4.0_lookup.json")

The lookup dict maps::

    {
        "sensory-event": ("event", "sensory-event"),
        "event": ("event",),
        ...
    }

Keys are casefolded short tag names (last slash-component). Values are tuples
of casefolded path components from the schema root to the tag (inclusive),
matching exactly what :attr:`~hed.schema.HedTagEntry.tag_terms` contains.
"""

from __future__ import annotations

import json
from pathlib import Path


def generate_schema_lookup(schema):
    """Create the short-name → ``tag_terms`` lookup table for a schema.

    Every entry in the tags section of *schema* (or of each member schema
    when *schema* is a :class:`~hed.schema.HedSchemaGroup`) contributes one
    item: the casefolded short tag name mapped to that entry's
    :attr:`~hed.schema.HedTagEntry.tag_terms` value.

    Parameters:
        schema (HedSchema or HedSchemaGroup): The loaded HED schema.

    Returns:
        dict[str, tuple[str, ...]]: Mapping of casefolded short tag name to
            the ``tag_terms`` value stored on the schema entry.

    Notes:
        - Entries whose name ends with ``/#`` (value placeholders) are skipped.
        - Entries that lack ``tag_terms``, or whose ``tag_terms`` is empty,
          are skipped.
        - For a :class:`~hed.schema.HedSchemaGroup` the member schemas are
          merged into a single dict; on a key collision the schema visited
          later replaces the earlier value.
        - Library namespace prefixes (e.g. ``"sc:"`` in ``"sc:Event"``) are
          **not** stripped — include the namespace when searching if needed.
    """
    table = {}

    # _iter_schemas yields the schema itself, or each member of a group.
    for member in _iter_schemas(schema):
        section = member.tags
        if section is None:
            continue

        for tag_name, tag_entry in section.items():
            # Value-placeholder entries (e.g. "Event/#") are internal only.
            if tag_name.endswith("/#"):
                continue

            terms = getattr(tag_entry, "tag_terms", None)
            if not terms:
                continue

            # Prefer the short name recorded on the entry; otherwise derive
            # it from the last slash-separated component of the entry name.
            short = getattr(tag_entry, "short_tag_name", None)
            if short is None:
                short = tag_name.rpartition("/")[2]

            # The entry's tag_terms value is stored as-is (not copied).
            table[short.casefold()] = terms

    return table
def _iter_schemas(schema):
    """Iterate over the individual schemas contained in *schema*.

    An object exposing a ``_schemas`` attribute is treated as a schema group
    and the values of that mapping are yielded in order. Any other object is
    treated as a single schema and yielded unchanged.

    Parameters:
        schema (HedSchema or HedSchemaGroup): The schema to iterate.

    Yields:
        HedSchema: Individual schema objects.
    """
    # A private sentinel distinguishes "attribute absent" from any real value.
    absent = object()
    members = getattr(schema, "_schemas", absent)

    if members is absent:
        yield schema
    else:
        # Group case: members are kept in a mapping; only the values matter.
        yield from members.values()
def save_schema_lookup(lookup, path):
    """Write a schema lookup dict to *path* as UTF-8 encoded JSON.

    Each value is converted to a list so that it is emitted as a JSON array;
    :func:`load_schema_lookup` turns the arrays back into tuples.

    Parameters:
        lookup (dict[str, tuple]): The lookup dict from :func:`generate_schema_lookup`.
        path (str or Path): Destination file path.
    """
    payload = {}
    for short_name, terms in lookup.items():
        payload[short_name] = list(terms)

    # Serialise fully before touching the file so a failure writes nothing.
    text = json.dumps(payload, indent=2)
    Path(path).write_text(text, encoding="utf-8")
def load_schema_lookup(path):
    """Read a schema lookup dict from a JSON file.

    The file is expected to have been produced by :func:`save_schema_lookup`:
    a JSON object whose values are arrays. Each array is converted back to a
    tuple.

    Parameters:
        path (str or Path): The JSON file to load.

    Returns:
        dict[str, tuple[str, ...]]: The schema lookup dict with tuple values.
    """
    with Path(path).open(encoding="utf-8") as handle:
        stored = json.load(handle)

    restored = {}
    for short_name, terms in stored.items():
        restored[short_name] = tuple(terms)
    return restored