// Source: bids/validator/tsvValidator.js

import { BidsHedIssue } from '../types/issues'
import { BidsTsvElement, BidsTsvRow } from '../types/tsv'
import { BidsValidator } from './validator'
import { parseHedString } from '../../parser/parser'
import ParsedHedString from '../../parser/parsedHedString'
import { generateIssue } from '../../issues/issues'
import { ReservedChecker } from '../../parser/reservedChecker'
import { cleanupEmpties, getTagListString } from '../../parser/parseUtils'
import { EventManager } from '../../parser/eventManager'

/**
 * Validator for HED data in BIDS TSV files.
 */
export class BidsHedTsvValidator extends BidsValidator {
  /**
   * The BIDS TSV file being validated.
   * @type {BidsTsvFile}
   */
  tsvFile

  /**
   * The singleton instance of the checker for reserved requirements.
   * @type {ReservedChecker}
   */
  reserved

  /**
   * Constructor.
   *
   * @param {BidsTsvFile} tsvFile - The BIDS TSV file being validated.
   * @param {Schemas} hedSchemas - The HED schemas used to validate the tsv file.
   */
  constructor(tsvFile, hedSchemas) {
    super(hedSchemas)
    this.tsvFile = tsvFile
    this.reserved = ReservedChecker.getInstance()
  }

  /**
   * Validate a BIDS TSV file.
   *
   * Issues are accumulated in `this.errors` and `this.warnings`; nothing is returned. Validation proceeds in
   * stages (merged sidecar, HED column alone, assembled rows, dataset-level checks) and stops at the first stage
   * that leaves any entry in `this.errors`.
   */
  validate() {
    // Validate the BIDS sidecar if it exists and return if there are errors
    if (this.tsvFile.mergedSidecar) {
      const issues = this.tsvFile.mergedSidecar.validate(this.hedSchemas)
      const splitErrors = BidsHedIssue.splitErrors(issues)
      this.errors.push(...(splitErrors.error ?? []))
      this.warnings.push(...(splitErrors.warning ?? []))
      if (this.errors.length > 0) {
        return
      }
    }

    // Validate the HED column by itself.
    this._validateHedColumn()
    if (this.errors.length > 0) {
      return
    }
    // Now do a full validation
    const bidsHedTsvParser = new BidsHedTsvParser(this.tsvFile, this.hedSchemas)
    const [bidsEvents, errorIssues, warningIssues] = bidsHedTsvParser.parse()
    this.errors.push(...errorIssues)
    this.warnings.push(...warningIssues)
    if (this.errors.length > 0) {
      return
    }
    this.validateDataset(bidsEvents)
    // The missing-data warnings are only reported for otherwise error-free files whose sidecar has HED data.
    if (this.errors.length === 0 && this.tsvFile.mergedSidecar?.hasHedData) {
      this._checkMissingHedWarning()
      this._checkMissingValueWarnings()
    }
  }

  /**
   * Warn when the sidecar references the HED column in a splice but the tsv has no HED column.
   * @private
   */
  _checkMissingHedWarning() {
    // Check for HED column used as splice but no HED column
    if (this.tsvFile.mergedSidecar.columnSpliceReferences.has('HED') && !this.tsvFile.parsedTsv.has('HED')) {
      this.warnings.push(BidsHedIssue.fromHedIssue(generateIssue('hedUsedAsSpliceButNoTsvHed', {}), this.tsvFile.file))
    }
  }

  /**
   * Check for categorical column value in tsv but not in sidecar.
   *
   * One warning is pushed per column, listing every distinct value that lacks a sidecar category entry.
   * @private
   */
  _checkMissingValueWarnings() {
    // Placeholder cell values that never need a sidecar entry (loop-invariant, so built once).
    const toRemove = new Set(['', 'n/a', null, undefined])
    for (const columnName of this.tsvFile.parsedTsv.keys()) {
      const sidecarColumn = this.tsvFile.mergedSidecar?.sidecarKeys.get(columnName)
      // Only columns that have a sidecar entry and are not value columns are checked.
      if (!sidecarColumn || sidecarColumn.isValueKey) {
        continue
      }
      // De-duplicate the column's values, then keep the real values that have no category entry.
      const tsvColumnValues = new Set(this.tsvFile.parsedTsv.get(columnName))
      const missingValues = [...tsvColumnValues].filter(
        (value) => !toRemove.has(value) && !sidecarColumn.categoryMap.has(value),
      )
      if (missingValues.length > 0) {
        const values = `[${missingValues.join(', ')}]`
        this.warnings.push(
          BidsHedIssue.fromHedIssue(
            generateIssue('sidecarKeyMissing', { column: columnName, values: values }),
            this.tsvFile.file,
          ),
        )
      }
    }
  }

  /**
   * Validate this TSV file's HED column.
   *
   * Each entry is validated purely for its side effects on `this.errors` and `this.warnings`.
   *
   * @private
   */
  _validateHedColumn() {
    // The array index is offset by 2 to obtain the reported tsv line
    // (presumably a header line plus 1-based numbering -- confirm against BidsTsvFile).
    this.tsvFile.hedColumnHedStrings.forEach((hedString, rowIndexMinusTwo) => {
      this._validateHedColumnString(hedString, rowIndexMinusTwo + 2)
    })
  }

  /**
   * Validate a string in this TSV file's HED column.
   *
   * @param {string} hedString - The string to be validated.
   * @param {number} rowIndex - The index of this row in the TSV file.
   * @private
   */
  _validateHedColumnString(hedString, rowIndex) {
    if (!hedString) {
      return
    }

    // Find basic parsing issues and return if unable to parse the string. (Warnings are okay.)
    const [parsedString, errorIssues, warningIssues] = parseHedString(hedString, this.hedSchemas, false, false, false)
    this.errors.push(...BidsHedIssue.fromHedIssues(errorIssues, this.tsvFile.file, { tsvLine: rowIndex }))
    this.warnings.push(...BidsHedIssue.fromHedIssues(warningIssues, this.tsvFile.file, { tsvLine: rowIndex }))
    if (parsedString === null) {
      return
    }

    // The HED column is not allowed to have column splices.
    if (parsedString.columnSplices.length > 0) {
      this.errors.push(
        BidsHedIssue.fromHedIssue(
          generateIssue('curlyBracesInHedColumn', {
            string: parsedString.hedString,
            tsvLine: rowIndex.toString(),
          }),
          this.tsvFile.file,
        ),
      )
      return
    }

    // Check whether definitions used exist and are used correctly.
    // NOTE(review): this dereferences mergedSidecar unconditionally although validate() treats it as optional.
    const defIssues = [
      ...this.tsvFile.mergedSidecar.definitions.validateDefs(parsedString, this.hedSchemas, false),
      ...this.tsvFile.mergedSidecar.definitions.validateDefExpands(parsedString, this.hedSchemas, false),
    ]
    this.errors.push(...BidsHedIssue.fromHedIssues(defIssues, this.tsvFile.file, { tsvLine: rowIndex }))
  }

  /**
   * Validate the HED data in a combined event TSV file/sidecar BIDS data collection.
   *
   * Issues are accumulated in `this.errors` and `this.warnings`.
   *
   * @param {BidsTsvElement[]} elements - The elements representing the tsv file (with HED strings parsed).
   */
  validateDataset(elements) {
    // Final top-tag detection cannot be done until the strings are fully assembled and finalized.
    this._checkNoTopTags(elements)
    if (this.errors.length > 0) {
      return
    }
    // Temporal files have to check Onset, Inset, Offset consistency.
    if (this.tsvFile.isTimelineFile) {
      this._validateTemporal(elements)
    } else {
      // Non-temporal files cannot have temporal tags.
      this._checkNoTime(elements)
    }
  }

  /**
   * Check the temporal relationships among events.
   *
   * @param {BidsTsvElement[]} elements - The elements representing the tsv file.
   * @private
   */
  _validateTemporal(elements) {
    // Check basic temporal conflicts such as Offset before Onset, or temporal tags with same def at same time.
    const eventManager = new EventManager()
    const [eventList, temporalIssues] = eventManager.parseEvents(elements)
    if (temporalIssues.length > 0) {
      this.errors.push(...temporalIssues)
      return
    }
    // There still may be non-temporal duplicates when multiple rows with the same onset.
    this._checkDuplicatesAcrossRows(elements)
    if (this.errors.length === 0) {
      this.errors.push(...eventManager.validate(eventList))
    }
  }

  /**
   * Check for duplicate tags when multiple rows with the same onset.
   *
   * Any issues found are pushed onto `this.errors` and `this.warnings`; nothing is returned.
   *
   * ### Note:
   * Duplicate onsets are relatively rare and duplicates for single rows are checked when a ParsedHedString is
   * constructed.
   *
   * @param {BidsTsvElement[]} elements - The elements representing the tsv file.
   * @private
   */
  _checkDuplicatesAcrossRows(elements) {
    const duplicateMap = this._getOnsetMap(elements)
    for (const elementList of duplicateMap.values()) {
      if (elementList.length === 1) {
        continue
      }
      // Assemble the HED strings associated with same onset into single string. Use the parse duplicate detection.
      const rowString = elementList.map((element) => element.hedString).join(',')
      // Only the issues matter here; the parsed string itself is discarded.
      const [, errorIssues, warningIssues] = parseHedString(rowString, this.hedSchemas, false, false, true)
      const tsvLines = BidsTsvElement.getTsvLines(elementList)
      this.errors.push(...BidsHedIssue.fromHedIssues(errorIssues, this.tsvFile.file, { tsvLine: tsvLines }))
      this.warnings.push(...BidsHedIssue.fromHedIssues(warningIssues, this.tsvFile.file, { tsvLine: tsvLines }))
    }
  }

  /**
   * Get map of onsets to BidsTsvElements.
   *
   * Elements with an empty HED string are left out of the map.
   *
   * @param {BidsTsvElement[]} elements - The elements representing the tsv file.
   * @returns {Map} - Map of onset value to a list of elements with that onset.
   * @private
   */
  _getOnsetMap(elements) {
    const onsetMap = new Map()
    for (const element of elements) {
      if (!element.hedString) {
        continue
      }
      if (onsetMap.has(element.onset)) {
        onsetMap.get(element.onset).push(element)
      } else {
        onsetMap.set(element.onset, [element])
      }
    }
    return onsetMap
  }

  /**
   * Top group tag requirements may not be satisfied until all splices have been done.
   *
   * Pushes an error for each element whose top-level tags include any tag that
   * `ReservedChecker.hasTopLevelTagGroupAttribute` flags.
   *
   * @param {BidsTsvElement[]} elements - The elements to be checked.
   * @private
   */
  _checkNoTopTags(elements) {
    for (const element of elements) {
      const topTags = element.parsedHedString ? element.parsedHedString.topLevelTags : []
      const badTags = topTags.filter((tag) => ReservedChecker.hasTopLevelTagGroupAttribute(tag))
      if (badTags.length > 0) {
        this.errors.push(
          BidsHedIssue.fromHedIssue(
            generateIssue('invalidTopLevelTag', { tag: getTagListString(badTags), string: element.hedString }),
            element.file,
            { tsvLine: element.tsvLine },
          ),
        )
      }
    }
  }

  /**
   * Verify that this non-temporal file does not contain any temporal tags.
   *
   * Elements without a parsed HED string are skipped, matching the guard in `_checkNoTopTags`.
   *
   * @param {BidsTsvElement[]} elements - The elements representing a tsv file (with HED string parsed).
   */
  _checkNoTime(elements) {
    for (const element of elements) {
      // Optional chaining yields undefined (falsy) when there is no parsed string, so such elements are skipped.
      if (element.parsedHedString?.tags.some((tag) => this.reserved.timelineTags.has(tag.schemaTag.name))) {
        this.errors.push(
          BidsHedIssue.fromHedIssue(
            generateIssue('temporalTagInNonTemporalContext', { string: element.hedString, tsvLine: element.tsvLine }),
            this.tsvFile.file,
          ),
        )
      }
    }
  }
}

/**
 * Class that performs basic parsing and splicing.
 */
export class BidsHedTsvParser {
  /** Cell or HED-string values that are treated as "no data". */
  static nullSet = new Set([null, undefined, '', 'n/a'])

  /** Matches one `{name}` column splice whose content contains no braces; the name is capture group 1. */
  static braceRegEx = /\{([^{}]*?)\}/g

  /**
   * The BIDS TSV file being parsed.
   * @type {BidsTsvFile}
   */
  tsvFile

  /**
   * The HED schema collection being parsed against.
   * @type {Schemas}
   */
  hedSchemas

  /**
   * Constructor.
   *
   * @param {BidsTsvFile} tsvFile The BIDS TSV file being parsed.
   * @param {Schemas} hedSchemas The HED schema collection being parsed against.
   */
  constructor(tsvFile, hedSchemas) {
    this.tsvFile = tsvFile
    this.hedSchemas = hedSchemas
  }

  /**
   * Combine the BIDS sidecar HED data into a BIDS TSV file's HED data.
   *
   * @returns {Array} - Returns a three-element array [BidsTsvElement[], BidsHedIssue[], BidsHedIssue[]]:
   *   the assembled row elements, the errors, and the warnings.
   */
  parse() {
    const tsvHedRows = this._generateHedRows()
    const tsvElements = this._parseHedRows(tsvHedRows)
    const [errors, warnings] = this._parseElementStrings(tsvElements)
    return [tsvElements, errors, warnings]
  }

  /**
   * Parse element HED strings.
   *
   * Each element's `parsedHedString` property is set in place to the result of parsing its `hedString`.
   *
   * @param {BidsTsvElement[]} elements - The objects representing tsv rows with their parsed HED strings.
   * @returns {Array} -  [BidsHedIssue[], BidsHedIssue[]] The errors and warnings resulting in creating the parsed HED strings.
   */
  _parseElementStrings(elements) {
    if (elements.length === 0) {
      return [[], []]
    }

    // Add the parsed HED strings to the elements, collecting the errors and warnings from every element.
    const errors = []
    const warnings = []
    for (const element of elements) {
      const [parsedHedString, errorIssues, warningIssues] = parseHedString(
        element.hedString,
        this.hedSchemas,
        false,
        false,
        true,
      )
      element.parsedHedString = parsedHedString
      errors.push(...BidsHedIssue.fromHedIssues(errorIssues, this.tsvFile.file, { tsvLine: element.tsvLine }))
      warnings.push(...BidsHedIssue.fromHedIssues(warningIssues, this.tsvFile.file, { tsvLine: element.tsvLine }))
    }
    return [errors, warnings]
  }

  /**
   * Generate a list of rows with column-to-value mappings.
   *
   * Only columns that have sidecar HED data, or that are named `HED` or `onset`, are included.
   *
   * @returns {Array} A list of single-row column-to-value mappings.
   * @private
   */
  _generateHedRows() {
    const tsvHedColumns = Array.from(this.tsvFile.parsedTsv.entries()).filter(
      ([header]) => this.tsvFile.mergedSidecar.hedData.has(header) || header === 'HED' || header === 'onset',
    )

    // Transpose the column-oriented data into one Map per row.
    const tsvHedRows = []
    for (const [header, data] of tsvHedColumns) {
      data.forEach((value, index) => {
        tsvHedRows[index] ??= new Map()
        tsvHedRows[index].set(header, value)
      })
    }
    return tsvHedRows
  }

  /**
   * Parse the rows in the TSV file into HED strings.
   *
   * Rows that assemble to an empty HED string are dropped from the result.
   *
   * @param {Map[]} tsvHedRows - A list of single-row column-to-value mappings.
   * @returns {BidsTsvRow[]} - A list of row-based parsed HED strings.
   * @private
   */
  _parseHedRows(tsvHedRows) {
    const hedRows = []
    tsvHedRows.forEach((row, index) => {
      // The array index is offset by 2 to obtain the tsv line number passed on to the row.
      const hedRow = this._parseHedRow(row, index + 2)
      if (hedRow !== null) {
        hedRows.push(hedRow)
      }
    })
    return hedRows
  }

  /**
   * Parse a row in a TSV file into a BIDS row.
   *
   * @param {Map} rowCells - The column-to-value mapping for a single row.
   * @param {number} tsvLine - The index of this row in the TSV file.
   * @returns {BidsTsvRow|null} - The assembled row, or null if the row has no HED content.
   * @private
   */
  _parseHedRow(rowCells, tsvLine) {
    const hedStringParts = []
    const columnMap = this._getColumnMapping(rowCells)
    this.spliceValues(columnMap)

    for (const [columnName, columnValue] of rowCells.entries()) {
      // If a splice, it can't be used in an assembled HED string.
      if (
        this.tsvFile.mergedSidecar.columnSpliceReferences.has(columnName) ||
        BidsHedTsvParser.nullSet.has(columnValue)
      ) {
        continue
      }
      if (columnMap.has(columnName) && !BidsHedTsvParser.nullSet.has(columnMap.get(columnName))) {
        hedStringParts.push(columnMap.get(columnName))
      }
    }
    const hedString = hedStringParts.join(',')
    if (hedString === '' || hedString === 'n/a') {
      return null
    }
    return new BidsTsvRow(hedString, this.tsvFile, tsvLine, rowCells)
  }

  /**
   * Generate a mapping from tsv columns to strings (may have splices in the strings)
   *
   * A mapped value is `''` when the row has no usable cell for the column, and `undefined` when a categorical
   * cell value has no entry in the sidecar.
   *
   * @param {Map} rowCells - The column-to-value mapping for a single row.
   * @returns {Map} - A mapping of column names to their corresponding parsed sidecar strings.
   * @private
   */
  _getColumnMapping(rowCells) {
    const columnMap = new Map()

    if (rowCells.has('HED')) {
      columnMap.set('HED', rowCells.get('HED'))
    }

    if (!this.tsvFile.mergedSidecar.hasHedData) {
      return columnMap
    }

    // Check for the columns with HED data in the sidecar
    for (const [columnName, columnValues] of this.tsvFile.mergedSidecar.parsedHedData.entries()) {
      if (!rowCells.has(columnName)) {
        columnMap.set(columnName, '')
        continue
      }
      const rowColumnValue = rowCells.get(columnName)
      if (rowColumnValue === 'n/a' || rowColumnValue === '') {
        columnMap.set(columnName, '')
        continue
      }

      if (columnValues instanceof ParsedHedString) {
        // Value column: substitute the cell value for the first '#'. A replacer function is used so that the
        // cell text is inserted literally; a string replacement would expand patterns such as `$&` or `$$`.
        const columnString = columnValues.hedString.replace('#', () => rowColumnValue)
        columnMap.set(columnName, columnString)
      } else if (columnValues instanceof Map) {
        // Categorical column: look up the HED string for this cell value (undefined if not in the sidecar).
        columnMap.set(columnName, columnValues.get(rowColumnValue)?.hedString)
      }
    }

    return columnMap
  }

  /**
   * Update the map to splice-in the values for columns that have splices.
   *
   * Columns whose mapped value is not a string (for example a categorical value with no sidecar entry, which
   * maps to `undefined`) are left untouched rather than passed on for splicing.
   *
   * @param {Map} columnMap - Map of column name to HED string for a row.
   *
   * Note: Updates the map in place.
   */
  spliceValues(columnMap) {
    if (!(this.tsvFile.mergedSidecar?.columnSpliceMapping?.size > 0)) {
      return
    }
    // Only iterate over the column names that have splices
    for (const column of this.tsvFile.mergedSidecar.columnSpliceMapping.keys()) {
      const unspliced = columnMap.get(column)
      // Nothing to splice into when the row produced no string for this column; calling replace would throw.
      if (typeof unspliced !== 'string') {
        continue
      }
      columnMap.set(column, this._replaceSplices(unspliced, columnMap))
    }
  }

  /**
   * Replace a HED string containing splices with a resolved version for the column value in a row.
   *
   * @param {string} unspliced - A HED string possibly with unresolved splices.
   * @param {Map} columnMap - The map of column name to HED string for a row.
   * @returns {string} - The fully resolved HED string with no splices.
   * @private
   */
  _replaceSplices(unspliced, columnMap) {
    const result = unspliced.replace(BidsHedTsvParser.braceRegEx, (match, content) => {
      // Resolve the replacement value
      const resolved = columnMap.has(content) ? columnMap.get(content) : ''
      // Replace with resolved value or empty string if in nullSet
      return BidsHedTsvParser.nullSet.has(resolved) ? '' : resolved
    })
    // Hand the result to cleanupEmpties (presumably tidies up what removed splices leave behind -- confirm).
    return cleanupEmpties(result)
  }
}

// Default export: the TSV validator class, which is also exported by name above.
export default BidsHedTsvValidator