Source code for hed.models.string_search

"""Lightweight HED search operating directly on raw HED strings.

Provides a string-based search system that sits between the fast but limited
:mod:`basic_search` (flat regex on pandas Series) and the full
:class:`~hed.models.QueryHandler` (requires :class:`~hed.models.HedString` objects
with a loaded schema).

Key classes:

- :class:`StringNode` — minimal tree node built from a raw HED string at parse
  time; duck-types both ``HedGroup`` and ``HedTag`` so that the existing
  :class:`~hed.models.query_expressions.Expression` subclasses can evaluate
  against it without modification.
- :class:`StringQueryHandler` — subclasses :class:`~hed.models.QueryHandler`,
  overriding :meth:`search` to accept a raw string instead of a
  :class:`~hed.models.HedString`.

Ancestor search support
-----------------------
Without a *schema lookup table* (see :mod:`hed.models.schema_lookup`), bare term
queries fall back to literal (casefold) matching.  To enable ancestor search on
short-form strings supply a ``schema_lookup`` dict generated by
:func:`~hed.models.schema_lookup.generate_schema_lookup`::

    from hed.models.schema_lookup import generate_schema_lookup
    from hed.models.string_search import StringQueryHandler

    lookup = generate_schema_lookup(schema)
    handler = StringQueryHandler("Event")
    results = handler.search("Sensory-event", schema_lookup=lookup)

Without a lookup, long-form strings support ancestor search for free because
``"Event/Sensory-event"`` is parsed to ``tag_terms = ("event", "sensory-event")``,
so a bare query ``"Event"`` matches it via the standard ``term in tag_terms`` check.
"""

from __future__ import annotations

from collections import deque

from hed.models.query_handler import QueryHandler
from hed.models.hed_string import HedString


class StringNode:
    """Lightweight tree node representing a parsed fragment of a raw HED string.

    One class plays three roles:

    - *root* (``text is None``, ``is_group=False``): the top-level string
      container.  It is never parenthesised but still takes part in group
      traversal.
    - *group* (``is_group=True``): a parenthesised group.
    - *tag leaf* (``text`` set, ``is_group=False``): a single tag.

    Duck-typing contract with the expression evaluation layer:

    - ``is_group`` — bool.
    - ``_parent`` — parent ``StringNode`` or ``None``.
    - ``children`` — direct child nodes (tags and groups).
    - ``tags()`` — direct child tag (non-group leaf) nodes.
    - ``groups()`` — direct child group nodes.
    - ``get_all_groups()`` — all group-like nodes in the subtree, including self.
    - ``get_all_tags()`` — all leaf tag nodes in the subtree.
    - ``find_tags_with_term(term, ...)`` — ancestor-aware tag search.
    - ``find_exact_tags(exact_tags, ...)`` — casefold-exact tag search.
    - ``find_wildcard_tags(search_tags, ...)`` — prefix tag search.
    - ``tag_terms`` — tuple of casefolded terms the leaf answers to.
    - ``short_tag`` — casefolded tag text (set on leaves), used by wildcard search.

    Parameters:
        text (str or None): Casefolded tag text for leaf nodes; ``None`` for the
            root node and for groups.
        is_group (bool): ``True`` if this node represents a parenthesised group.
        parent (StringNode or None): The parent node.
        depth (int): Nesting depth (root = 0).
        schema_lookup (dict or None): Optional mapping of casefolded short tag
            name to a tuple of that tag's terms (itself and its ancestors), used
            to populate ``tag_terms`` for ancestor search.
    """

    def __init__(self, text=None, is_group=False, parent=None, depth=0, schema_lookup=None):
        self.text = text  # casefolded by the caller; only meaningful on leaf nodes
        self.is_group = is_group
        self._parent = parent
        self.depth = depth
        self.children = []

        # --- HedTag duck-typing attributes (only populated on leaf tag nodes) ---
        if text is not None and not is_group:
            # short_tag keeps the full tag text, including any slash/value part, so
            # that a wildcard prefix such as "def/def" is tested against the whole
            # tag rather than only its last component.
            self.short_tag = text
            self.tag_terms = self._resolve_tag_terms(text, schema_lookup)
        else:
            self.short_tag = ""
            self.tag_terms = ()

    @staticmethod
    def _resolve_tag_terms(text, schema_lookup):
        """Return the tuple of casefolded terms a leaf with *text* answers to.

        Without a lookup the slash-separated components are the terms, so
        ``"event/sensory-event"`` yields ``("event", "sensory-event")``.

        With a lookup the components are probed from right to left and the first
        one found supplies the schema terms.  Probing only the last component
        would lose the schema terms of a tag that carries a value or extension
        (``"def/mydef"``): the value is not a schema entry, so the tag would stop
        matching its own name.  Components to the right of the hit are kept as
        literal terms so anything that matched literally still matches.  If no
        component is known, the plain slash split is used.

        Parameters:
            text (str): Casefolded tag text.
            schema_lookup (dict or None): Short-name → terms mapping, or ``None``.

        Returns:
            tuple: The terms for the tag.
        """
        parts = text.split("/")
        if schema_lookup is None:
            return tuple(parts)
        for index in range(len(parts) - 1, -1, -1):
            terms = schema_lookup.get(parts[index])
            if terms is not None:
                return tuple(terms) + tuple(parts[index + 1:])
        return tuple(parts)

    # ------------------------------------------------------------------
    # HedGroup duck-typing interface
    # ------------------------------------------------------------------

    def tags(self):
        """Return direct child tag (leaf, non-group) nodes.

        Returns:
            list[StringNode]: Direct children that are leaves.
        """
        return [child for child in self.children if not child.is_group]

    def groups(self):
        """Return direct child group nodes.

        Returns:
            list[StringNode]: Direct children that are parenthesised groups.
        """
        return [child for child in self.children if child.is_group]

    def get_all_groups(self):
        """Return all group-like nodes in this subtree, including self.

        The receiver is always included, even when it is the root container with
        ``is_group=False``.  Nodes are returned in document order (pre-order,
        depth-first): a group precedes its sub-groups and siblings keep their
        left-to-right order.

        Returns:
            list[StringNode]: All group-like nodes, self first.
        """
        found = []
        pending = [self]
        while pending:
            node = pending.pop()
            found.append(node)
            # Push in reverse so the left-most child group is popped next.
            pending.extend(child for child in reversed(node.children) if child.is_group)
        return found

    def get_all_tags(self):
        """Return all leaf tag nodes in this subtree in document order (depth-first).

        Returns:
            list[StringNode]: All leaf nodes in the subtree, left to right as they
            appear in the string.
        """
        found = []
        pending = list(reversed(self.children))
        while pending:
            node = pending.pop()
            if node.is_group:
                # Reverse so the group's first child is visited next.
                pending.extend(reversed(node.children))
            else:
                found.append(node)
        return found

    def _candidate_tags(self, recursive):
        """Return the leaf tags a ``find_*`` call inspects (subtree or direct children)."""
        return self.get_all_tags() if recursive else self.tags()

    @staticmethod
    def _shape_matches(pairs, include_groups):
        """Project ``(tag, group)`` pairs as requested by *include_groups*.

        ``0`` keeps the tags, ``1`` keeps the parent groups, and any other value
        returns the pairs unchanged.
        """
        if include_groups in (0, 1):
            return [pair[include_groups] for pair in pairs]
        return pairs

    def find_tags_with_term(self, term, recursive=False, include_groups=2):
        """Find leaf tags whose ``tag_terms`` include *term* (ancestor search).

        What ``tag_terms`` holds is decided when the leaf is constructed: the
        slash-separated components of the tag text, or terms taken from the
        schema lookup when one was supplied.

        Parameters:
            term (str): The term to search for (compared casefolded).
            recursive (bool): If True, search all descendants; otherwise only direct children.
            include_groups (int): Controls return format.
                ``0`` — tags only; ``1`` — parent groups only; ``2`` — ``(tag, group)`` pairs.

        Returns:
            list: Depends on *include_groups*.
        """
        wanted = term.casefold()
        pairs = [(tag, tag._parent) for tag in self._candidate_tags(recursive) if wanted in tag.tag_terms]
        return self._shape_matches(pairs, include_groups)

    def find_exact_tags(self, exact_tags, recursive=False, include_groups=1):
        """Find leaf tags whose casefolded text exactly matches any entry in *exact_tags*.

        Parameters:
            exact_tags (list[str]): Tags to match against (compared casefolded).
            recursive (bool): If True, search all descendants.
            include_groups (int): ``0`` — tags; ``1`` — groups; ``2`` — pairs.

        Returns:
            list: Depends on *include_groups*.
        """
        # Node text is stored casefolded, so only the targets need folding.
        targets = {candidate.casefold() for candidate in exact_tags}
        pairs = [(tag, tag._parent) for tag in self._candidate_tags(recursive) if tag.text in targets]
        return self._shape_matches(pairs, include_groups)

    def find_wildcard_tags(self, search_tags, recursive=False, include_groups=2):
        """Find leaf tags whose ``short_tag`` starts with any entry in *search_tags*.

        Parameters:
            search_tags (list[str]): Prefix strings (compared casefolded).
            recursive (bool): If True, search all descendants.
            include_groups (int): ``0`` — tags; ``1`` — groups; ``2`` — pairs.

        Returns:
            list: Depends on *include_groups*.
        """
        # str.startswith accepts a tuple and tests every prefix in one call; an
        # empty tuple matches nothing.
        prefixes = tuple(prefix.casefold() for prefix in search_tags)
        pairs = [(tag, tag._parent) for tag in self._candidate_tags(recursive) if tag.short_tag.startswith(prefixes)]
        return self._shape_matches(pairs, include_groups)

    # ------------------------------------------------------------------
    # HedTag duck-typing interface
    # ------------------------------------------------------------------

    def __eq__(self, other):
        """Compare with another StringNode or a string.

        A string is compared against ``self.text`` after casefolding the string.
        Two nodes are equal only when they are the same object.

        Parameters:
            other (StringNode or str): The value to compare against.

        Returns:
            bool: True if equal; ``NotImplemented`` for unsupported types.
        """
        if isinstance(other, str):
            return self.text == other.casefold()
        if isinstance(other, StringNode):
            return self is other
        return NotImplemented

    def __hash__(self):
        # Identity hash: nodes can be stored in sets/dicts, but string equality
        # above only helps ``==`` and list membership, not hashed lookups.
        return id(self)

    def __str__(self):
        if self.is_group:
            return "(" + ",".join(str(child) for child in self.children) + ")"
        if self.text is None:
            # Root container: children joined without surrounding parentheses.
            return ",".join(str(child) for child in self.children)
        return self.text

    def __repr__(self):
        kind = "group" if self.is_group else ("root" if self.text is None else "tag")
        return f"StringNode({kind}, text={self.text!r}, children={len(self.children)})"
# --------------------------------------------------------------------------- # Parser # ---------------------------------------------------------------------------
def parse_hed_string(raw_string, schema_lookup=None):
    """Build a :class:`StringNode` tree from a raw HED string.

    The input is tokenised with :meth:`~hed.models.HedString.split_hed_string`,
    which yields ``(is_tag, (start, end))`` entries.  Tag spans become leaf nodes
    (text stripped and casefolded); delimiter spans are scanned for ``(`` and
    ``)`` to open and close group nodes.  No :class:`~hed.models.HedTag` or
    :class:`~hed.models.HedGroup` objects are created.

    The returned root is a non-parenthesised container (``is_group=False``,
    ``text=None``).  Each ``(`` adds a child with ``is_group=True`` under the
    group that is open at that point.

    Parameters:
        raw_string (str): A raw HED string such as ``"(Red, Square), Blue"``.
        schema_lookup (dict or None): Optional lookup handed to every leaf node so
            it can populate ``tag_terms`` for ancestor search on short-form strings.

    Returns:
        StringNode: Root node of the parsed tree.

    Notes:
        - If the tokeniser call raises, an empty root is returned instead of
          propagating the error.
        - Unbalanced parentheses do not raise: a surplus ``)`` is ignored and an
          unclosed ``(`` simply leaves its group open, giving a partial tree.
        - Tag spans that are empty after stripping whitespace are skipped.
    """
    top = StringNode(text=None, is_group=False, depth=0)
    try:
        spans = HedString.split_hed_string(raw_string)
    except Exception:
        return top

    # Groups that are currently open, innermost last; the root never closes.
    open_nodes = [top]
    for is_tag, (begin, finish) in spans:
        piece = raw_string[begin:finish]

        if not is_tag:
            # Delimiter text: only the parentheses affect the tree shape.
            for symbol in piece:
                if symbol == "(":
                    container = open_nodes[-1]
                    opened = StringNode(text=None, is_group=True, parent=container, depth=len(open_nodes))
                    container.children.append(opened)
                    open_nodes.append(opened)
                elif symbol == ")" and len(open_nodes) > 1:
                    open_nodes.pop()
            continue

        label = piece.strip().casefold()
        if not label:
            continue
        container = open_nodes[-1]
        container.children.append(
            StringNode(
                text=label,
                is_group=False,
                parent=container,
                depth=len(open_nodes) - 1,
                schema_lookup=schema_lookup,
            )
        )
    return top
# --------------------------------------------------------------------------- # StringQueryHandler # ---------------------------------------------------------------------------
class StringQueryHandler(QueryHandler):
    """Run HED queries directly on raw HED strings.

    This subclass of :class:`~hed.models.QueryHandler` adds nothing to query
    compilation; it only overrides :meth:`search` so the target is a raw string
    instead of a :class:`~hed.models.HedString`.  The raw string is parsed with
    :func:`parse_hed_string` and the compiled expression tree (``self.tree``) is
    evaluated against the resulting :class:`StringNode` tree.

    Ancestor search
    ~~~~~~~~~~~~~~~
    Without a *schema_lookup*, a bare query term is matched against the
    slash-separated components of each tag, so a short-form tag only matches its
    own name (query ``"Event"`` matches the tag ``"event"`` but not the short-form
    tag ``"sensory-event"``).  Supply a lookup dict from
    :func:`~hed.models.schema_lookup.generate_schema_lookup` to let short-form
    tags match their ancestors.

    Example::

        handler = StringQueryHandler("Event && Action")
        bool(handler.search("Event, Action"))   # True  (literal match)

    Parameters:
        expression_string (str): The HED query expression — same syntax as
            :class:`~hed.models.QueryHandler`.
    """

    def search(self, raw_string, schema_lookup=None):
        """Evaluate the compiled query against a raw HED string.

        Parameters:
            raw_string (str): The raw HED string to search (short, long, or mixed form).
            schema_lookup (dict or None): Optional schema lookup dict forwarded to
                :func:`parse_hed_string` so leaf tags can carry ancestor terms.

        Returns:
            list: The result of ``self.tree.handle_expr`` on the parsed tree — a list
            of :class:`~hed.models.query_util.SearchResult` objects that is truthy
            when at least one match was found.
        """
        return self.tree.handle_expr(parse_hed_string(raw_string, schema_lookup=schema_lookup))
# --------------------------------------------------------------------------- # Convenience: list search # ---------------------------------------------------------------------------