"""Classes responsible for basic character validation of a string or tag."""
import json
import re
import os
from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors
# Path of the JSON file holding the value-class regular expressions. It is relative to the
# directory containing this module and is resolved in CharRexValidator._get_rex_dict.
CLASS_REX_FILENAME = "../data/class_regex.json"
class CharValidator:
    """Class responsible for basic character level validation of a string or tag.

    Whole-string checks support two rule sets, selected at construction time:
    the default rules reject every character whose code point is above 127,
    while the modern rules reject only non-printable characters.
    """

    # Non-alphanumeric characters allowed in an extension/value. The # sign is allowed
    # by default as it is specifically checked for separately.
    DEFAULT_ALLOWED_PLACEHOLDER_CHARS = ".+-^ _#"
    # Non-alphanumeric characters allowed in the base tag. Placeholder characters are
    # checked elsewhere and are only added on request (see check_tag_invalid_chars).
    TAG_ALLOWED_CHARS = "-_/"

    # Characters rejected anywhere in a HED string.
    INVALID_STRING_CHARS = "[]{}~"
    # Same set without the curly braces; used when placeholders are allowed.
    INVALID_STRING_CHARS_PLACEHOLDERS = "[]~"

    def __init__(self, modern_allowed_char_rules=False):
        """Does basic character validation for HED strings/tags

        Parameters:
            modern_allowed_char_rules(bool): If True, use 8.3 style rules for unicode characters.
        """
        self._validate_characters = modern_allowed_char_rules

    def check_invalid_character_issues(self, hed_string, allow_placeholders) -> list[dict]:
        """Report invalid characters.

        Parameters:
            hed_string (str): A HED string.
            allow_placeholders (bool): Allow placeholder and curly brace characters.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - Invalid tag characters are defined by self.INVALID_STRING_CHARS or
              self.INVALID_STRING_CHARS_PLACEHOLDERS
            - With the modern rules a character is also invalid when it is not printable;
              otherwise it is also invalid when its code point is above 127.
        """
        if allow_placeholders:
            forbidden_chars = self.INVALID_STRING_CHARS_PLACEHOLDERS
        else:
            forbidden_chars = self.INVALID_STRING_CHARS

        validation_issues = []
        for index, character in enumerate(hed_string):
            if self._validate_characters:
                outside_charset = not character.isprintable()
            else:
                outside_charset = ord(character) > 127
            if character in forbidden_chars or outside_charset:
                validation_issues += self._report_invalid_character_error(hed_string, index)
        return validation_issues

    def check_tag_invalid_chars(self, original_tag, allow_placeholders) -> list[dict]:
        """Report invalid characters in the given tag.

        The schema namespace is checked first, then the characters of the base tag.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            allow_placeholders (bool): Allow placeholder characters(#) if True.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        validation_issues = self._check_invalid_prefix_issues(original_tag)
        allowed_chars = self.TAG_ALLOWED_CHARS
        if allow_placeholders:
            allowed_chars += "#"
        validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag)
        return validation_issues

    def check_for_invalid_extension_chars(
        self, original_tag, validate_text, error_code=None, index_offset=0
    ) -> list[dict]:
        """Report invalid characters in extension/value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            validate_text (str): the text we want to validate, if not the full extension.
            error_code (str): The code to override the error as. Again mostly for def/def-expand tags.
            index_offset (int): Offset into the extension validate_text starts at.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        # A space is appended explicitly, so it is accepted whatever the class constants contain.
        allowed_chars = self.TAG_ALLOWED_CHARS + self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS + " "
        # Reported indices are relative to the whole tag: skip the base tag plus one more
        # character, then apply the caller's offset.
        return self._check_invalid_chars(
            validate_text,
            allowed_chars,
            original_tag,
            starting_index=len(original_tag.org_base_tag) + 1 + index_offset,
            error_code=error_code,
        )

    @staticmethod
    def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0, error_code=None):
        """Helper for checking for invalid characters.

        Alphanumeric characters and the characters in allowed_chars are always accepted.

        Parameters:
            check_string (str): String to be checked for invalid characters.
            allowed_chars (str): Characters allowed in string.
            source_tag (HedTag): Tag from which the string came from.
            starting_index (int): Starting index of check_string within the tag.
            error_code (str): The code to override the error as. Again mostly for def/def-expand tags.

        Returns:
            list: List of dictionaries with validation issues.
        """
        validation_issues = []
        for offset, character in enumerate(check_string):
            # Todo: Remove the ":" patch when clock times and invalid characters are more properly checked
            if character.isalnum() or character in allowed_chars or character == ":":
                continue
            position = starting_index + offset
            validation_issues += ErrorHandler.format_error(
                ValidationErrors.INVALID_TAG_CHARACTER,
                tag=source_tag,
                index_in_tag=position,
                index_in_tag_end=position + 1,
                actual_error=error_code,
            )
        return validation_issues

    @staticmethod
    def _check_invalid_prefix_issues(original_tag):
        """Check for invalid schema namespace.

        A non-empty namespace is valid only when everything except its last character
        is alphabetic (the last character is presumably the ":" separator -- confirm
        against HedTag).

        Parameters:
            original_tag (HedTag): Tag to look

        Returns:
            list: List of dictionaries with validation issues.
        """
        schema_namespace = original_tag.schema_namespace
        if not schema_namespace or schema_namespace[:-1].isalpha():
            return []
        return [
            *ErrorHandler.format_error(
                ValidationErrors.TAG_NAMESPACE_PREFIX_INVALID, tag=original_tag, tag_namespace=schema_namespace
            )
        ]

    @staticmethod
    def _report_invalid_character_error(hed_string, index):
        """Report an invalid character.

        A tilde is reported as TILDES_UNSUPPORTED; any other character as CHARACTER_INVALID.

        Parameters:
            hed_string (str): The HED string that caused the error.
            index (int): The index of the invalid character in the HED string.

        Returns:
            list: A singleton list with a dictionary representing the error.
        """
        if hed_string[index] == "~":
            error_type = ValidationErrors.TILDES_UNSUPPORTED
        else:
            error_type = ValidationErrors.CHARACTER_INVALID
        return ErrorHandler.format_error(error_type, char_index=index, source_string=hed_string)
class CharRexValidator(CharValidator):
    """Character validator that also checks values against per-class regular expressions.

    The regular expressions are read from the JSON file named by CLASS_REX_FILENAME when
    the validator is constructed. The loaded dictionary is expected to provide the keys
    "class_chars", "char_regex" and "class_words".
    """

    def __init__(self, modern_allowed_char_rules=False):
        """Does basic character validation for HED strings/tags

        Parameters:
            modern_allowed_char_rules(bool): If True, use 8.3 style rules for Unicode characters.
        """
        super().__init__(modern_allowed_char_rules)
        self._rex_dict = self._get_rex_dict()

    def get_problem_chars(self, in_str, cname):
        """Return a list of (index, char) pairs for characters in in_str not allowed by the value class cname.

        Parameters:
            in_str (str): The string to check.
            cname (str): The value class name used to look up allowed character classes.

        Returns:
            list[tuple[int, str]]: Each tuple contains the character index and the offending character.
                The list is empty when no character classes are registered for cname.
        """
        allowed_classes = self._rex_dict["class_chars"].get(cname)
        if not allowed_classes:
            # No character classes registered for this value class: nothing to report.
            return []

        # One alternation that accepts a character matching any of the allowed character classes.
        char_regex = self._rex_dict["char_regex"]
        allowed_pattern = re.compile("|".join(char_regex[char_class] for char_class in allowed_classes))

        # Each character is tested on its own, so the reported index is its position in in_str.
        return [(index, char) for index, char in enumerate(in_str) if not allowed_pattern.match(char)]

    def is_valid_value(self, in_string, cname):
        """Check whether in_string is a valid whole-word value for class cname.

        Parameters:
            in_string (str): The string to validate.
            cname (str): The value class name to look up the word-level regex for.

        Returns:
            True | re.Match | False:
                - ``True`` if no word-level regex is defined for *cname* (class imposes no constraint).
                - A ``re.Match`` object if *in_string* matches the word-level regex (valid value).
                - ``False`` if *in_string* does not match the word-level regex (invalid value).

        Notes:
            - The regex is applied with ``re.match``, which anchors only at the start of
              in_string; rejecting trailing text is left to the regex itself.
        """
        # Retrieve the word-level regex for the given class name, if any.
        class_regex = self._rex_dict["class_words"].get(cname)
        if not class_regex:
            return True
        return re.match(class_regex, in_string) or False

    @staticmethod
    def _get_rex_dict():
        """Load the class regex definitions from the JSON file next to this package.

        Returns:
            The object decoded from the JSON file at CLASS_REX_FILENAME, resolved relative
            to the directory containing this module.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        json_path = os.path.realpath(os.path.join(module_dir, CLASS_REX_FILENAME))
        with open(json_path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)