Source: parser/tokenizer.js

import { unicodeName } from 'unicode-name'

import { generateIssue } from '../issues/issues'

/**
 * The characters that have special meaning to the tokenizer.
 *
 * The table is frozen so that the delimiters shared by every tokenizer instance cannot be altered at runtime.
 */
const CHARACTERS = Object.freeze({
  // Used when checking for stray blanks around slashes and inside library prefixes.
  BLANK: ' ',
  // Tag group delimiters.
  OPENING_GROUP: '(',
  CLOSING_GROUP: ')',
  // Column splice delimiters.
  OPENING_COLUMN: '{',
  CLOSING_COLUMN: '}',
  // Separates tags and groups.
  COMMA: ',',
  // Terminates a library schema prefix.
  COLON: ':',
  // Separates the levels within a tag.
  SLASH: '/',
  // Marks a value placeholder within a tag.
  PLACEHOLDER: '#',
})

/**
 * Locate the non-whitespace portion of a string.
 *
 * @param {string} originalString - The string to examine.
 * @returns {number[]|null} The index of the first non-whitespace character and the index just past the last one,
 *   or null if the string is empty or contains only whitespace.
 */
function getTrimmedBounds(originalString) {
  const totalLength = originalString.length
  const leadingWhitespace = totalLength - originalString.trimStart().length

  // Everything was stripped, so there is no non-whitespace character at all.
  if (leadingWhitespace === totalLength) {
    return null
  }

  // Dropping only the trailing whitespace leaves the exclusive end index as the remaining length.
  return [leadingWhitespace, originalString.trimEnd().length]
}

// Characters that may never appear in a HED string: a few punctuation marks plus the control codes.
const invalidCharacters = new Set(['[', ']', '~', '"'])
// Control code ranges (inclusive): U+0000 to U+001F and U+007F to U+009F.
for (const [firstCode, lastCode] of [
  [0x00, 0x1f],
  [0x7f, 0x9f],
]) {
  for (let code = firstCode; code <= lastCode; code++) {
    invalidCharacters.add(String.fromCodePoint(code))
  }
}

/**
 * Base specification for a span of the tokenized string.
 */
export class SubstringSpec {
  /**
   * The start and end positions of the substring, stored as a two-element array.
   * @type {number[]}
   */
  bounds

  /**
   * @param {number} start - The starting position of the substring.
   * @param {number} end - The ending position of the substring.
   */
  constructor(start, end) {
    const span = [start, end]
    this.bounds = span
  }
}

/**
 * Specification for a single tag found by the tokenizer.
 */
export class TagSpec extends SubstringSpec {
  /**
   * The text of the tag, with surrounding whitespace removed.
   * @type {string}
   */
  tag

  /**
   * The library schema prefix attached to this tag, if any.
   * @type {string}
   */
  library

  /**
   * @param {string} tag - The tag text; it is trimmed before being stored.
   * @param {number} start - The starting position of the tag.
   * @param {number} end - The ending position of the tag.
   * @param {string} librarySchema - The library schema prefix for the tag.
   */
  constructor(tag, start, end, librarySchema) {
    super(start, end)

    const trimmedTag = tag.trim()
    this.tag = trimmedTag
    this.library = librarySchema
  }
}

/**
 * Specification for a tag group found by the tokenizer.
 */
export class GroupSpec extends SubstringSpec {
  /**
   * The specifications of the groups nested directly inside this one.
   * @type {GroupSpec[]}
   */
  children

  /**
   * @param {number} start - The starting position of the group.
   * @param {number} end - The ending position of the group.
   * @param {GroupSpec[]} children - The nested group specifications; the array is stored as given, not copied.
   */
  constructor(start, end, children) {
    super(start, end)

    this.children = children
  }
}

/**
 * Specification for a column splice template found by the tokenizer.
 */
export class ColumnSpliceSpec extends SubstringSpec {
  /**
   * The name of the column this splice refers to, with surrounding whitespace removed.
   * @type {string}
   */
  columnName

  /**
   * @param {string} name - The column name; it is trimmed before being stored.
   * @param {number} start - The starting position of the splice.
   * @param {number} end - The ending position of the splice.
   */
  constructor(name, start, end) {
    super(start, end)

    const trimmedName = name.trim()
    this.columnName = trimmedName
  }
}

/**
 * The mutable bookkeeping used while a string is being tokenized.
 *
 * Every instance starts with its own fresh arrays.
 */
class TokenizerState {
  // Characters in the token currently being parsed
  currentToken = ''

  // Number of groups currently open
  groupDepth = 0

  // Starting index of this token
  startingIndex = 0

  // Type and position of the last delimiter
  lastDelimiter = [undefined, -1]

  // Library schema prefix collected for the current token
  librarySchema = ''

  // Position of the last slash in current token
  lastSlash = -1

  // One list of collected specs per open nesting level, starting with the top level
  currentGroupStack = [[]]

  // Group specs that are currently open
  parenthesesStack = []
}

/**
 * Class for tokenizing HED strings.
 *
 * The string is scanned one character at a time. Delimiter characters are dispatched to dedicated handlers that
 * update the tokenizer state; every other valid character is appended to the current token. Scanning stops as
 * soon as a character produces an issue.
 */
export class HedStringTokenizer {
  /**
   * @param {string} hedString - The HED string to tokenize.
   */
  constructor(hedString) {
    this.hedString = hedString
    this.issues = []
    this.state = null
  }

  /**
   * Split the HED string into delimiters and tags.
   *
   * The tokenizer state and issue list are reset on every call. If any issue is found, the first two entries of
   * the result are an empty array and null.
   *
   * @returns {Array} - [TagSpec[], GroupSpec, Issue[]] representing the tag specifications, group bounds, and any issues found.
   */
  tokenize() {
    this.initializeTokenizer()
    // Empty strings cannot be tokenized
    if (this.hedString.trim().length === 0) {
      this.pushIssue('emptyTagFound', 0)
      return [[], null, this.issues]
    }
    for (let i = 0; i < this.hedString.length; i++) {
      const character = this.hedString.charAt(i)
      this.handleCharacter(i, character)
      // Stop at the first character that produced an issue.
      if (this.issues.length > 0) {
        return [[], null, this.issues]
      }
    }
    this.finalizeTokenizer()
    if (this.issues.length > 0) {
      return [[], null, this.issues]
    } else {
      // After a successful pass only the top-level entries remain on the two stacks.
      return [this.state.currentGroupStack.pop(), this.state.parenthesesStack.pop(), []]
    }
  }

  /**
   * Discard the current token and start a new one after the given position.
   *
   * @param {number} i - The index of the character that ended the current token.
   */
  resetToken(i) {
    this.state.startingIndex = i + 1
    this.state.currentToken = ''
    this.state.librarySchema = ''
    // Keep this numeric so that it matches the initial value set by TokenizerState.
    this.state.lastSlash = -1
  }

  /**
   * Perform the end-of-string checks, push any pending tag, and close any groups that are still open.
   */
  finalizeTokenizer() {
    if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
      // Extra opening brace
      this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1])
    } else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_GROUP) {
      // Extra opening parenthesis
      // NOTE(review): this code is spelled 'unclosedParentheses' while unwindGroupStack uses
      // 'unclosedParenthesis' — confirm that both exist in the issue catalogue.
      this.pushIssue('unclosedParentheses', this.state.lastDelimiter[1])
    } else if (
      this.state.lastDelimiter[0] === CHARACTERS.COMMA &&
      this.hedString.slice(this.state.lastDelimiter[1] + 1).trim().length === 0
    ) {
      this.pushIssue('emptyTagFound', this.state.lastDelimiter[1]) // Extra comma
    } else if (this.state.lastSlash >= 0 && this.hedString.slice(this.state.lastSlash + 1).trim().length === 0) {
      this.pushIssue('extraSlash', this.state.lastSlash) // Extra slash
    }
    if (
      this.state.currentToken.trim().length > 0 &&
      ![undefined, CHARACTERS.COMMA].includes(this.state.lastDelimiter[0])
    ) {
      // Missing comma
      this.pushIssue('commaMissing', this.state.lastDelimiter[1] + 1)
    } else {
      if (this.state.currentToken.trim().length > 0) {
        this.pushTag(this.hedString.length)
      }
      this.unwindGroupStack()
    }
  }

  /**
   * Reset the issue list and the state, and seed the group stack with a top-level group spanning the whole string.
   */
  initializeTokenizer() {
    this.issues = []
    this.state = new TokenizerState()
    this.state.parenthesesStack = [new GroupSpec(0, this.hedString.length, [])]
  }

  /**
   * Process a single character of the string.
   *
   * Delimiters go to their handlers, invalid characters raise an issue, and anything else is added to the
   * current token.
   *
   * @param {number} i - The index of the character in the string.
   * @param {string} character - The character at that index.
   */
  handleCharacter(i, character) {
    switch (character) {
      case CHARACTERS.OPENING_GROUP:
        this.handleOpeningGroup(i)
        break
      case CHARACTERS.CLOSING_GROUP:
        this.handleClosingGroup(i)
        break
      case CHARACTERS.OPENING_COLUMN:
        this.handleOpeningColumn(i)
        break
      case CHARACTERS.CLOSING_COLUMN:
        this.handleClosingColumn(i)
        break
      case CHARACTERS.COMMA:
        this.handleComma(i)
        break
      case CHARACTERS.COLON:
        this.handleColon(i)
        break
      case CHARACTERS.SLASH:
        this.handleSlash(i)
        break
      default:
        if (invalidCharacters.has(character)) {
          this.pushInvalidCharacterIssue(character, i)
        } else {
          this.state.currentToken += character
        }
    }
  }

  /**
   * Handle a comma, which ends the current tag (if any).
   *
   * @param {number} i - The index of the comma.
   */
  handleComma(i) {
    // The text between the previous delimiter and this comma.
    const trimmed = this.hedString.slice(this.state.lastDelimiter[1] + 1, i).trim()
    if (
      [CHARACTERS.OPENING_GROUP, CHARACTERS.COMMA, undefined].includes(this.state.lastDelimiter[0]) &&
      trimmed.length === 0
    ) {
      this.pushIssue('emptyTagFound', i) // Empty tag Ex: ",x" or "(, x" or "y, ,x"
    } else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
      this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1]) // Unclosed curly brace Ex: "{ x,"
    }
    if (
      [CHARACTERS.CLOSING_GROUP, CHARACTERS.CLOSING_COLUMN].includes(this.state.lastDelimiter[0]) &&
      trimmed.length > 0
    ) {
      // A tag followed a group or column with no comma Ex:  (x) yz
      // pushInvalidTag is used so that the offending tag is included in the issue.
      this.pushInvalidTag('invalidTag', i, trimmed)
    } else if (trimmed.length > 0) {
      this.pushTag(i) // Tag has just finished
    } else {
      this.resetToken(i) // After a group or column
    }
    this.state.lastDelimiter = [CHARACTERS.COMMA, i]
  }

  /**
   * Handle a slash, rejecting slashes that are misplaced or surrounded by blanks.
   *
   * @param {number} i - The index of the slash.
   */
  handleSlash(i) {
    if (this.state.currentToken.trim().length === 0) {
      this.pushIssue('extraSlash', i) // Slash at beginning of tag.
    } else if (this.state.lastSlash >= 0 && this.hedString.slice(this.state.lastSlash + 1, i).trim().length === 0) {
      this.pushIssue('extraSlash', i) // Slashes with only blanks between
    } else if (i > 0 && this.hedString.charAt(i - 1) === CHARACTERS.BLANK) {
      this.pushIssue('extraBlank', i - 1) // Blank before slash such as slash in value
    } else if (i < this.hedString.length - 1 && this.hedString.charAt(i + 1) === CHARACTERS.BLANK) {
      this.pushIssue('extraBlank', i + 1) //Blank after a slash
    } else if (this.hedString.slice(i).trim().length === 0) {
      // NOTE(review): slice(i) starts with the slash itself, so this condition can never hold; a slash at the very
      // end of the string is reported by finalizeTokenizer instead. slice(i + 1) may have been intended — confirm
      // before changing, since it would alter the reported index.
      this.pushIssue('extraSlash', this.state.startingIndex) // Extra slash at the end
    } else {
      this.state.currentToken += CHARACTERS.SLASH
      this.state.lastSlash = i
    }
  }

  /**
   * Handle an opening parenthesis, which starts a new group.
   *
   * @param {number} i - The index of the opening parenthesis.
   */
  handleOpeningGroup(i) {
    if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
      this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1]) // After open curly brace Ex: "{  ("
    } else if (this.state.lastDelimiter[0] === CHARACTERS.CLOSING_COLUMN) {
      this.pushIssue('commaMissing', this.state.lastDelimiter[1]) // After close curly brace Ex: "} ("
    } else if (this.state.lastDelimiter[0] === CHARACTERS.CLOSING_GROUP) {
      this.pushIssue('commaMissing', this.state.lastDelimiter[1] + 1) // After close group Ex: ") ("
    } else if (this.state.currentToken.trim().length > 0) {
      this.pushInvalidTag('commaMissing', i, this.state.currentToken.trim()) // After tag Ex: "x ("
    } else {
      // Open a new nesting level; the end bound of the group is filled in by closeGroup.
      this.state.currentGroupStack.push([])
      this.state.parenthesesStack.push(new GroupSpec(i, undefined, []))
      this.resetToken(i)
      this.state.groupDepth++
      this.state.lastDelimiter = [CHARACTERS.OPENING_GROUP, i]
    }
  }

  /**
   * Handle a closing parenthesis, which ends the innermost open group.
   *
   * @param {number} i - The index of the closing parenthesis.
   */
  handleClosingGroup(i) {
    if (this.state.groupDepth <= 0) {
      this.pushIssue('unopenedParenthesis', i) // No corresponding opening group
    } else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
      this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1]) // After open curly brace Ex: "{ )"
    } else {
      if ([CHARACTERS.OPENING_GROUP, CHARACTERS.COMMA].includes(this.state.lastDelimiter[0])) {
        // Should be a tag here
        this.pushTag(i)
      }
      this.closeGroup(i) // Close the group by updating its bounds and moving it to the parent group.
      this.state.lastDelimiter = [CHARACTERS.CLOSING_GROUP, i]
    }
  }

  /**
   * Handle an opening curly brace, which starts a column splice.
   *
   * @param {number} i - The index of the opening curly brace.
   */
  handleOpeningColumn(i) {
    if (this.state.currentToken.trim().length > 0) {
      this.pushInvalidCharacterIssue(CHARACTERS.OPENING_COLUMN, i) // Middle of a token Ex: "x {"
    } else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
      this.pushIssue('nestedCurlyBrace', i) // After open curly brace   Ex: "{x{"
    } else {
      this.state.lastDelimiter = [CHARACTERS.OPENING_COLUMN, i]
    }
  }

  /**
   * Handle a closing curly brace, which ends a column splice.
   *
   * @param {number} i - The index of the closing curly brace.
   */
  handleClosingColumn(i) {
    if (this.state.lastDelimiter[0] !== CHARACTERS.OPENING_COLUMN) {
      this.pushIssue('unopenedCurlyBrace', i) // No matching open brace Ex: " x}"
    } else if (!this.state.currentToken.trim()) {
      this.pushIssue('emptyCurlyBrace', i) // Column slice cannot be empty Ex: "{  }"
    } else {
      // Close column by updating bounds and moving it to the parent group, push a column splice on the stack.
      // NOTE(review): the end bound is the index of the closing brace itself, whereas closeGroup stores i + 1 —
      // confirm that consumers expect this.
      this.state.currentGroupStack[this.state.groupDepth].push(
        new ColumnSpliceSpec(this.state.currentToken.trim(), this.state.lastDelimiter[1], i),
      )
      this.resetToken(i)
      this.state.lastDelimiter = [CHARACTERS.CLOSING_COLUMN, i]
    }
  }

  /**
   * Handle a colon, which either ends a library schema prefix or is part of the tag itself.
   *
   * @param {number} i - The index of the colon.
   */
  handleColon(i) {
    const trimmed = this.state.currentToken.trim()
    if (this.state.librarySchema || trimmed.includes(CHARACTERS.BLANK) || trimmed.includes(CHARACTERS.SLASH)) {
      this.state.currentToken += CHARACTERS.COLON // If colon has been seen or is part of a value.
    } else if (/[^A-Za-z]/.test(trimmed)) {
      this.pushIssue('invalidTagPrefix', i) // Prefix not alphabetic Ex:  "1a:xxx"
    } else {
      // NOTE(review): only leading whitespace is removed here, so blanks between the prefix and the colon are
      // kept in the stored prefix — confirm this is intended.
      const lib = this.state.currentToken.trimStart()
      // resetToken clears librarySchema, so the prefix must be stored afterwards.
      this.resetToken(i)
      this.state.librarySchema = lib
    }
  }

  /**
   * Report and close every group that is still open at the end of the string.
   */
  unwindGroupStack() {
    while (this.state.groupDepth > 0) {
      this.pushIssue(
        'unclosedParenthesis',
        this.state.parenthesesStack[this.state.parenthesesStack.length - 1].bounds[0],
      )
      this.closeGroup(this.hedString.length)
    }
  }

  /**
   * Turn the current token into a tag specification in the innermost open group, then reset the token.
   *
   * An issue is pushed instead if the token is empty or has a malformed placeholder; the token is not reset then.
   *
   * @param {number} i - The index of the character that ended the tag.
   */
  pushTag(i) {
    if (this.state.currentToken.trim().length === 0) {
      this.pushIssue('emptyTagFound', i)
    } else if (this.checkForBadPlaceholderIssues()) {
      this.pushInvalidTag('invalidPlaceholder', i, this.state.currentToken)
    } else {
      // The bounds are relative to the token, so they are shifted by where the token starts in the string.
      const bounds = getTrimmedBounds(this.state.currentToken)
      this.state.currentGroupStack[this.state.groupDepth].push(
        new TagSpec(
          this.state.currentToken.trim(),
          this.state.startingIndex + bounds[0],
          this.state.startingIndex + bounds[1],
          this.state.librarySchema,
        ),
      )
      this.resetToken(i)
    }
  }

  /**
   * Check whether the current token uses the placeholder character incorrectly.
   *
   * @returns {boolean} True if the token has more than one placeholder, a placeholder that does not directly
   *   follow a slash, or a placeholder followed by non-blank text that does not start with a blank.
   */
  checkForBadPlaceholderIssues() {
    const tokenSplit = this.state.currentToken.split(CHARACTERS.PLACEHOLDER)
    if (tokenSplit.length === 1) {
      // No placeholders to worry about for this tag
      return false
    }
    return (
      tokenSplit.length > 2 ||
      !tokenSplit[0].endsWith(CHARACTERS.SLASH) || // A placeholder must be after a slash
      (tokenSplit[1].trim().length > 0 && tokenSplit[1][0] !== CHARACTERS.BLANK)
    )
  }

  /**
   * Close the innermost open group and attach it, along with its collected contents, to its parent.
   *
   * @param {number} i - The index of the closing parenthesis, or the string length when unwinding unclosed groups.
   */
  closeGroup(i) {
    const groupSpec = this.state.parenthesesStack.pop()
    groupSpec.bounds[1] = i + 1
    if (this.hedString.slice(groupSpec.bounds[0] + 1, i).trim().length === 0) {
      this.pushIssue('emptyTagFound', i) //The group is empty
    }
    this.state.parenthesesStack[this.state.groupDepth - 1].children.push(groupSpec)
    this.state.currentGroupStack[this.state.groupDepth - 1].push(this.state.currentGroupStack.pop())
    this.state.groupDepth--
  }

  /**
   * Record an issue located at an index of the string.
   *
   * @param {string} issueCode - The internal issue code.
   * @param {number} index - The index in the string to which the issue refers.
   */
  pushIssue(issueCode, index) {
    this.issues.push(generateIssue(issueCode, { index, string: this.hedString }))
  }

  /**
   * Record an issue that refers to a specific tag.
   *
   * @param {string} issueCode - The internal issue code.
   * @param {number} index - The index in the string to which the issue refers.
   * @param {string} tag - The text of the offending tag.
   */
  pushInvalidTag(issueCode, index, tag) {
    this.issues.push(generateIssue(issueCode, { index, tag: tag, string: this.hedString }))
  }

  /**
   * Record an invalid character issue; the result of unicodeName is reported in place of the raw character.
   *
   * @param {string} character - The offending character.
   * @param {number} index - The index of the character in the string.
   */
  pushInvalidCharacterIssue(character, index) {
    this.issues.push(
      generateIssue('invalidCharacter', { character: unicodeName(character), index, string: this.hedString }),
    )
  }
}