Source code for hed.validator.util.char_util

"""Classes responsible for basic character validation of a string or tag."""

import json
import re
import os

from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors

# Path of the JSON file holding the value-class regex definitions. It is joined onto
# the directory containing this module (not the current working directory) when loaded.
CLASS_REX_FILENAME = "../data/class_regex.json"


class CharValidator:
    """Perform basic character-level validation of a HED string or tag."""

    # The '#' sign is allowed by default because it is specifically checked for separately.
    DEFAULT_ALLOWED_PLACEHOLDER_CHARS = ".+-^ _#"
    # Placeholder characters are checked elsewhere, but are allowed by default.
    TAG_ALLOWED_CHARS = "-_/"
    INVALID_STRING_CHARS = "[]{}~"
    INVALID_STRING_CHARS_PLACEHOLDERS = "[]~"

    def __init__(self, modern_allowed_char_rules=False):
        """Create a validator for the characters of HED strings/tags.

        Parameters:
            modern_allowed_char_rules (bool): If True, use 8.3 style rules for unicode characters.
        """
        self._validate_characters = modern_allowed_char_rules

    def check_invalid_character_issues(self, hed_string, allow_placeholders) -> list[dict]:
        """Report every invalid character found in a HED string.

        A character is invalid when it belongs to the forbidden set, or when it
        fails the active rule set: non-printable under the modern rules, code
        point above 127 otherwise.

        Parameters:
            hed_string (str): A HED string.
            allow_placeholders (bool): Allow placeholder and curly brace characters.

        Returns:
            list: Validation issues. Each issue is a dictionary.

        Notes:
            - The forbidden set is self.INVALID_STRING_CHARS, or
              self.INVALID_STRING_CHARS_PLACEHOLDERS when placeholders are allowed.
        """
        forbidden = self.INVALID_STRING_CHARS_PLACEHOLDERS if allow_placeholders else self.INVALID_STRING_CHARS

        issues = []
        for position, symbol in enumerate(hed_string):
            if symbol in forbidden:
                is_bad = True
            elif self._validate_characters:
                is_bad = not symbol.isprintable()
            else:
                is_bad = ord(symbol) > 127
            if is_bad:
                issues += self._report_invalid_character_error(hed_string, position)
        return issues

    def check_tag_invalid_chars(self, original_tag, allow_placeholders) -> list[dict]:
        """Report invalid characters in the base portion of a tag.

        The schema namespace of the tag is checked as well.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            allow_placeholders (bool): Allow placeholder characters(#) if True.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        issues = self._check_invalid_prefix_issues(original_tag)
        permitted = (self.TAG_ALLOWED_CHARS + "#") if allow_placeholders else self.TAG_ALLOWED_CHARS
        issues += self._check_invalid_chars(original_tag.org_base_tag, permitted, original_tag)
        return issues

    def check_for_invalid_extension_chars(
        self, original_tag, validate_text, error_code=None, index_offset=0
    ) -> list[dict]:
        """Report invalid characters in an extension/value.

        Parameters:
            original_tag (HedTag): The original tag that is used to report the error.
            validate_text (str): The text we want to validate, if not the full extension.
            error_code (str): The code to override the error as. Again mostly for def/def-expand tags.
            index_offset (int): Offset into the extension at which validate_text starts.

        Returns:
            list: Validation issues. Each issue is a dictionary.
        """
        permitted = "".join((self.TAG_ALLOWED_CHARS, self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS, " "))
        # The checked text sits after the base tag plus one separator character.
        first_index = len(original_tag.org_base_tag) + 1 + index_offset
        return self._check_invalid_chars(
            validate_text,
            permitted,
            original_tag,
            starting_index=first_index,
            error_code=error_code,
        )

    @staticmethod
    def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0, error_code=None):
        """Collect an issue for each character that is neither alphanumeric nor allowed.

        Parameters:
            check_string (str): String to be checked for invalid characters.
            allowed_chars (str): Characters allowed in string.
            source_tag (HedTag): Tag from which the string came from.
            starting_index (int): Starting index of check_string within the tag.
            error_code (str): The code to override the error as. Again mostly for def/def-expand tags.

        Returns:
            list: List of dictionaries with validation issues.
        """
        issues = []
        for offset, symbol in enumerate(check_string):
            # Todo: Remove the ":" patch when clock times and invalid characters are more properly checked
            if symbol.isalnum() or symbol in allowed_chars or symbol == ":":
                continue
            position = starting_index + offset
            issues += ErrorHandler.format_error(
                ValidationErrors.INVALID_TAG_CHARACTER,
                tag=source_tag,
                index_in_tag=position,
                index_in_tag_end=position + 1,
                actual_error=error_code,
            )
        return issues

    @staticmethod
    def _check_invalid_prefix_issues(original_tag):
        """Check the schema namespace of a tag for non-alphabetic characters.

        The final character of the namespace is excluded from the check.

        Parameters:
            original_tag (HedTag): Tag whose schema namespace is examined.

        Returns:
            list: List of dictionaries with validation issues.
        """
        namespace = original_tag.schema_namespace
        if not namespace or namespace[:-1].isalpha():
            return []
        return list(
            ErrorHandler.format_error(
                ValidationErrors.TAG_NAMESPACE_PREFIX_INVALID, tag=original_tag, tag_namespace=namespace
            )
        )

    @staticmethod
    def _report_invalid_character_error(hed_string, index):
        """Build the error for the invalid character at the given position.

        A tilde is reported with its own error type; any other character is
        reported as a generic invalid character.

        Parameters:
            hed_string (str): The HED string that caused the error.
            index (int): The index of the invalid character in the HED string.

        Returns:
            list: A singleton list with a dictionary representing the error.
        """
        if hed_string[index] == "~":
            error_type = ValidationErrors.TILDES_UNSUPPORTED
        else:
            error_type = ValidationErrors.CHARACTER_INVALID
        return ErrorHandler.format_error(error_type, char_index=index, source_string=hed_string)
class CharRexValidator(CharValidator):
    """Character validator that also checks values against per-class regular expressions."""

    def __init__(self, modern_allowed_char_rules=False):
        """Create the validator and load the class regex definitions.

        Parameters:
            modern_allowed_char_rules (bool): If True, use 8.3 style rules for Unicode characters.
        """
        super().__init__(modern_allowed_char_rules)
        self._rex_dict = self._get_rex_dict()

    def get_problem_chars(self, in_str, cname):
        """Find the characters of in_str that the value class cname does not allow.

        Parameters:
            in_str (str): The string to check.
            cname (str): The value class name used to look up allowed character classes.

        Returns:
            list[tuple[int, str]]: Each tuple contains the character index and the offending character.
                The list is empty when no character classes are listed for cname.
        """
        class_names = self._rex_dict["class_chars"].get(cname, [])
        if not class_names:
            return []

        # A character is acceptable if any one of the allowed character classes matches it.
        char_patterns = self._rex_dict["char_regex"]
        allowed = re.compile("|".join([char_patterns[name] for name in class_names]))
        return [(position, symbol) for position, symbol in enumerate(in_str) if not allowed.match(symbol)]

    def is_valid_value(self, in_string, cname):
        """Check in_string against the word-level regex of the value class cname.

        Parameters:
            in_string (str): The string to validate.
            cname (str): The value class name to look up the word-level regex for.

        Returns:
            True | re.Match | False:
                - ``True`` if no word-level regex is defined for *cname* (class imposes no constraint).
                - A ``re.Match`` object if *in_string* matches the word-level regex (valid value).
                - ``False`` if *in_string* does not match the word-level regex (invalid value).
        """
        word_pattern = self._rex_dict["class_words"].get(cname, [])
        if not word_pattern:
            return True
        return re.match(word_pattern, in_string) or False

    @staticmethod
    def _get_rex_dict():
        """Load the class regex JSON file located relative to this module.

        Returns:
            The parsed JSON content of the file named by CLASS_REX_FILENAME.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        rex_path = os.path.realpath(os.path.join(module_dir, CLASS_REX_FILENAME))
        with open(rex_path, "r", encoding="utf-8") as handle:
            return json.load(handle)