"""Lightweight HED search operating directly on raw HED strings.
Provides a string-based search system that sits between the fast but limited
:mod:`basic_search` (flat regex on pandas Series) and the full
:class:`~hed.models.QueryHandler` (requires :class:`~hed.models.HedString` objects
with a loaded schema).
Key classes:
- :class:`StringNode` — minimal tree node built from a raw HED string at parse
time; duck-types both ``HedGroup`` and ``HedTag`` so that the existing
:class:`~hed.models.query_expressions.Expression` subclasses can evaluate
against it without modification.
- :class:`StringQueryHandler` — subclasses :class:`~hed.models.QueryHandler`,
overriding :meth:`search` to accept a raw string instead of a
:class:`~hed.models.HedString`.
Ancestor search support
-----------------------
Without a *schema lookup table* (see :mod:`hed.models.schema_lookup`), bare term
queries fall back to literal (casefold) matching. To enable ancestor search on
short-form strings supply a ``schema_lookup`` dict generated by
:func:`~hed.models.schema_lookup.generate_schema_lookup`::
from hed.models.schema_lookup import generate_schema_lookup
from hed.models.string_search import StringQueryHandler
lookup = generate_schema_lookup(schema)
handler = StringQueryHandler("Event")
results = handler.search("Sensory-event", schema_lookup=lookup)
Without a lookup, long-form strings support ancestor search for free because
``"Event/Sensory-event"`` is parsed to ``tag_terms = ("event", "sensory-event")``,
so a bare query ``"Event"`` matches it via the standard ``term in tag_terms`` check.
"""
from __future__ import annotations
from collections import deque
from hed.models.query_handler import QueryHandler
from hed.models.hed_string import HedString
[docs]
class StringNode:
    """Lightweight tree node representing a parsed fragment of a raw HED string.

    One class plays three roles, distinguished by ``text`` and ``is_group``:

    - *root*  — ``text is None`` and ``is_group=False``; the unparenthesised
      top-level container (the counterpart of :class:`~hed.models.HedString`).
    - *group* — ``is_group=True``; a parenthesised group (the counterpart of
      :class:`~hed.models.HedGroup`).
    - *tag leaf* — ``text`` is a string and ``is_group=False`` (the counterpart
      of :class:`~hed.models.HedTag`).

    Duck-typing contract with the expression evaluation layer:

    - ``is_group`` — bool.
    - ``_parent`` — parent ``StringNode`` or ``None``.
    - ``children`` — direct child nodes (tags and groups).
    - ``tags()`` — direct child tag (non-group leaf) nodes.
    - ``groups()`` — direct child group nodes.
    - ``get_all_groups()`` — all group-like nodes in the subtree, including self.
    - ``get_all_tags()`` — all leaf tag nodes in the subtree.
    - ``find_tags_with_term(term, ...)`` — ancestor-aware tag search.
    - ``find_exact_tags(exact_tags, ...)`` — casefold-exact tag search.
    - ``find_wildcard_tags(search_tags, ...)`` — prefix tag search.
    - ``tag_terms`` — tuple of casefolded ancestry components (set on leaves).
    - ``short_tag`` — casefolded tag text (set on leaves), used by wildcard search.

    NOTE(review): ``tags()``, ``get_all_tags()`` and the three ``find_*``
    methods are part of the contract above but are not defined in the portion
    of the class shown here — confirm they are provided before relying on them.

    Parameters:
        text (str or None): Tag text for leaf nodes; ``None`` for the root and
            for group nodes.  The caller is expected to pass it already
            casefolded — this class does not casefold it.
        is_group (bool): ``True`` if this node represents a parenthesised group.
        parent (StringNode or None): The parent node.
        depth (int): Nesting depth (root = 0).
        schema_lookup (dict or None): Optional mapping of casefolded short tag
            name to a tuple of its ancestor terms
            (:func:`~hed.models.schema_lookup.generate_schema_lookup`), used to
            populate ``tag_terms`` for ancestor search.  It is consulted during
            construction only and is not stored on the node.
    """

    def __init__(self, text=None, is_group=False, parent=None, depth=0, schema_lookup=None):
        self.text = text  # only meaningful for leaf (non-group, non-root) nodes
        self.is_group = is_group
        self._parent = parent
        self.depth = depth
        self.children = []

        # --- HedTag duck-typing attributes (populated on leaf tag nodes only) ---
        if text is not None and not is_group:
            # short_tag keeps the complete tag text, slash/value part included
            # (e.g. "def/defname"), so that prefix matching can test the whole
            # text rather than only its last component.
            self.short_tag = text
            self.tag_terms = self._resolve_tag_terms(text, schema_lookup)
        else:
            self.short_tag = ""
            self.tag_terms = ()

    @staticmethod
    def _resolve_tag_terms(text, schema_lookup):
        """Build the ``tag_terms`` tuple for a leaf tag.

        Without a lookup the slash-separated path itself is used, so a long-form
        tag such as ``"event/sensory-event"`` yields ``("event", "sensory-event")``.

        With a lookup the path components are tried from right to left and the
        first one found in the lookup supplies the ancestor terms; components to
        its right (a value or extension, e.g. ``"mydef"`` in ``"def/mydef"``) are
        appended so they remain matchable.  Trying only the last component would
        drop the real tag name of every value-taking tag.  When no component is
        known to the lookup the plain path split is used, which is the same
        result the no-lookup branch produces.

        Parameters:
            text (str): The (casefolded) tag text.
            schema_lookup (dict or None): Short-name → ancestor-terms mapping.

        Returns:
            tuple: The terms a bare-term query is compared against.
        """
        parts = text.split("/")
        if schema_lookup is None:
            return tuple(parts)

        for index in range(len(parts) - 1, -1, -1):
            ancestors = schema_lookup.get(parts[index])
            if ancestors is None:
                continue
            trailing = parts[index + 1:]
            if trailing:
                return tuple(ancestors) + tuple(trailing)
            return tuple(ancestors)

        return tuple(parts)

    # ------------------------------------------------------------------
    # HedGroup duck-typing interface
    # ------------------------------------------------------------------
    def groups(self):
        """Return direct child group nodes.

        Returns:
            list[StringNode]: Direct children whose ``is_group`` is true, in
            child order.
        """
        return [child for child in self.children if child.is_group]

    def get_all_groups(self):
        """Return all group-like nodes in this subtree, including self.

        The receiver is always included, even when it is the root (which has
        ``is_group=False``).  Descendant groups follow in document (pre-order)
        order: a group appears before the groups nested inside it, and sibling
        groups keep their left-to-right order.

        Returns:
            list[StringNode]: All group-like nodes, self first.
        """
        found = [self]
        # Children are pushed reversed so that pop() yields them left-to-right.
        pending = list(reversed(self.children))
        while pending:
            node = pending.pop()
            if node.is_group:
                found.append(node)
                pending.extend(reversed(node.children))
        return found

    # ------------------------------------------------------------------
    # HedTag duck-typing interface
    # ------------------------------------------------------------------
    def __eq__(self, other):
        """Compare with another StringNode or a string.

        A string is compared, after casefolding, against ``self.text``; root
        and group nodes have ``text is None`` and therefore never equal a
        string.  Two ``StringNode`` objects are equal only when they are the
        same object.  Per the original notes, string comparison exists so that
        ``tag in exact_tags`` membership tests work during exact-tag search.

        Parameters:
            other (StringNode or str): The value to compare against.

        Returns:
            bool: True if equal; ``NotImplemented`` for any other type.
        """
        if isinstance(other, str):
            return self.text == other.casefold()
        if isinstance(other, StringNode):
            return self is other
        return NotImplemented

    def __hash__(self):
        # Identity hash, consistent with node-to-node equality being identity.
        return id(self)

    def __str__(self):
        if self.is_group:
            return "(" + ",".join(str(child) for child in self.children) + ")"
        if self.text is None:
            # Root: children joined without enclosing parentheses.
            return ",".join(str(child) for child in self.children)
        return self.text

    def __repr__(self):
        if self.is_group:
            kind = "group"
        elif self.text is None:
            kind = "root"
        else:
            kind = "tag"
        return f"StringNode({kind}, text={self.text!r}, children={len(self.children)})"
# ---------------------------------------------------------------------------
# Parser
# ---------------------------------------------------------------------------
[docs]
def parse_hed_string(raw_string, schema_lookup=None):
    """Build a :class:`StringNode` tree from a raw HED string.

    The string is tokenised with :meth:`~hed.models.HedString.split_hed_string`;
    each token is a ``(is_tag, (start, end))`` pair indexing into *raw_string*.
    Tag tokens become leaf nodes (text stripped and casefolded); the characters
    of delimiter tokens are scanned for ``(`` and ``)`` to open and close group
    nodes.  No ``HedTag`` or ``HedGroup`` objects are constructed here.

    The returned root is an unparenthesised container (``is_group=False``,
    ``text=None``); parenthesised sub-groups are child nodes with
    ``is_group=True``.

    Parameters:
        raw_string (str): A raw HED string such as ``"(Red, Square), Blue"``.
        schema_lookup (dict or None): Optional mapping produced by
            :func:`~hed.models.schema_lookup.generate_schema_lookup`.  It is
            forwarded to every leaf node, which uses it to populate
            ``tag_terms``; group nodes do not receive it.

    Returns:
        StringNode: Root node of the parsed tree.

    Notes:
        - Nothing is raised for malformed input.  If the splitter raises, an
          empty root is returned.  A ``)`` with no open group is ignored, and a
          ``(`` that is never closed simply leaves its group open to the end.
        - Tag tokens that are empty after stripping whitespace are dropped.
        - A leaf is given the same ``depth`` as the container holding it.
    """
    tree_root = StringNode(text=None, is_group=False, depth=0)

    try:
        pieces = HedString.split_hed_string(raw_string)
    except Exception:
        # Best-effort parser: an unsplittable input yields an empty tree.
        return tree_root

    # Innermost currently-open container is always the last element.
    open_nodes = [tree_root]

    for is_tag, (begin, finish) in pieces:
        container = open_nodes[-1]
        chunk = raw_string[begin:finish]

        if is_tag:
            cleaned = chunk.strip().casefold()
            if not cleaned:
                continue
            container.children.append(
                StringNode(
                    text=cleaned,
                    is_group=False,
                    parent=container,
                    depth=len(open_nodes) - 1,
                    schema_lookup=schema_lookup,
                )
            )
            continue

        # Delimiter token: only parentheses change the tree shape.
        for symbol in chunk:
            if symbol == "(":
                owner = open_nodes[-1]
                opened = StringNode(text=None, is_group=True, parent=owner, depth=len(open_nodes))
                owner.children.append(opened)
                open_nodes.append(opened)
            elif symbol == ")" and len(open_nodes) > 1:
                # Never pop the root, so stray ')' characters are harmless.
                open_nodes.pop()

    return tree_root
# ---------------------------------------------------------------------------
# StringQueryHandler
# ---------------------------------------------------------------------------
[docs]
class StringQueryHandler(QueryHandler):
    """Run a compiled HED query against raw HED strings.

    This subclass of :class:`~hed.models.QueryHandler` defines only
    :meth:`search`; query compilation is inherited.  ``search`` parses the raw
    string with :func:`parse_hed_string` and evaluates ``self.tree`` (presumably
    the expression tree built by the parent constructor — confirm against
    :class:`~hed.models.QueryHandler`) on the resulting :class:`StringNode`
    tree, which stands in for the ``HedGroup``/``HedTag`` objects the
    expression classes normally receive.

    Ancestor search
    ~~~~~~~~~~~~~~~
    When no *schema_lookup* is given, a leaf's ``tag_terms`` are just the
    slash-separated components of its own text.  A short-form tag therefore
    matches a bare query only literally (``"Event"`` matches ``"event"`` but
    not ``"sensory-event"``).  Pass a lookup dict from
    :func:`~hed.models.schema_lookup.generate_schema_lookup` to enable ancestor
    search on short-form strings.

    Example::

        handler = StringQueryHandler("Event && Action")
        bool(handler.search("Event, Action"))  # True (literal match)

    Parameters:
        expression_string (str): The HED query expression — same syntax as
            :class:`~hed.models.QueryHandler`.
    """

    def search(self, raw_string, schema_lookup=None):
        """Evaluate the compiled query on a raw HED string.

        Parameters:
            raw_string (str): The raw HED string to search (any form — short,
                long, mixed).
            schema_lookup (dict or None): Optional schema lookup dict, passed
                through to :func:`parse_hed_string` so leaf ``tag_terms`` can
                include ancestor terms.

        Returns:
            list: The value returned by ``self.tree.handle_expr``; per the
            original documentation, a list of
            :class:`~hed.models.query_util.SearchResult` objects that is truthy
            when at least one match was found.
        """
        parsed_tree = parse_hed_string(raw_string, schema_lookup=schema_lookup)
        compiled_query = self.tree
        return compiled_query.handle_expr(parsed_tree)
# ---------------------------------------------------------------------------
# Convenience: list search
# ---------------------------------------------------------------------------
[docs]
def string_search(strings, query, schema_lookup=None):
    """Apply one query to many raw HED strings and return a boolean mask.

    The query is compiled a single time into a :class:`StringQueryHandler`,
    which is then run on each element in order.  Any element that is not a
    ``str`` (for example ``None`` or ``float('nan')``) and any empty string
    yields ``False`` without being searched.

    Parameters:
        strings (list[str]): A list of raw HED strings.
        query (str): A HED query expression (same syntax as
            :class:`~hed.models.QueryHandler`).
        schema_lookup (dict or None): Optional schema lookup dict for ancestor
            search; see :func:`~hed.models.schema_lookup.generate_schema_lookup`.

    Returns:
        list[bool]: One boolean per input element, in input order.

    Example::

        from hed.models.string_search import string_search

        mask = string_search(events["HED"].tolist(), "Sensory-event")
        matching_rows = [row for row, m in zip(events.itertuples(), mask) if m]
    """
    compiled = StringQueryHandler(query)
    mask = []
    for entry in strings:
        # Guard clause: non-strings and "" are never handed to the searcher.
        if not isinstance(entry, str) or not entry:
            mask.append(False)
            continue
        mask.append(bool(compiled.search(entry, schema_lookup=schema_lookup)))
    return mask