Source code for hed.models.base_input

"""
Superclass representing a basic columnar file.
"""

import os
from typing import Union
import openpyxl
import pandas as pd
from hed.models.column_metadata import ColumnMetadata
from hed.models.definition_dict import DefinitionDict
from hed.models.column_mapper import ColumnMapper
from hed.errors.exceptions import HedFileError, HedExceptions

from hed.models.df_util import _handle_curly_braces_refs, filter_series_by_onset



[docs]
class BaseInput:
    """Superclass representing a basic columnar file."""

    # File extensions that are loaded as tab-separated text files.
    TEXT_EXTENSION = [".tsv", ".txt"]
    # File extensions that are loaded as Excel workbooks.
    EXCEL_EXTENSION = [".xlsx"]


[docs]
    def __init__(
        self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, name=None, allow_blank_names=True
    ):
        """Load a columnar file (or dataframe) and attach a column mapper to it.

        Parameters:
            file (str or file-like or pd.Dataframe): An xlsx/tsv file to open, or an already loaded dataframe.
            file_type (str or None): ".xlsx" (Excel), ".tsv" or ".txt" (tab-separated text).
                Taken from the extension of file if file is a filename and file_type is None.
                Ignored if file is a pandas dataframe.
            worksheet_name (str or None): Name of the Excel worksheet to use.
                (Not applicable to tsv files.)
            has_column_names (bool): True if file has column names.
                This value is ignored if you pass in a pandas dataframe.
            mapper (ColumnMapper or None): Indicates which columns have HED tags.
                A default ColumnMapper is created if None.
            name (str or None): Optional name used when reporting errors.
                Defaults to file if file is a filename.
            allow_blank_names (bool): If True, column names can be blank.

        Raises:
            HedFileError: For various issues.

        Notes: Reasons for raising HedFileError include:
            - file is blank.
            - An invalid dataframe was passed with size 0.
            - An invalid extension was provided.
            - A duplicate or empty column name appears.
            - Cannot open the indicated file.
            - The specified worksheet name does not exist.
            - If the sidecar file or tabular file had invalid format and could not be read.

        """
        self._mapper = ColumnMapper() if mapper is None else mapper
        self._has_column_names = has_column_names
        self._name = name
        self._worksheet_name = worksheet_name
        # Only populated when the data is loaded from an Excel file.
        self._loaded_workbook = None
        self._dataframe = None

        input_type = file_type
        if isinstance(file, str):
            if input_type is None:
                # No explicit type: fall back on the filename extension.
                input_type = os.path.splitext(file)[1]
            if self.name is None:
                self._name = file

        self._open_dataframe_file(file, has_column_names, input_type)

        column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names)
        if column_issues:
            raise HedFileError(
                HedExceptions.BAD_COLUMN_NAMES,
                "Duplicate or blank columns found. See issues.",
                self.name,
                issues=column_issues,
            )

        # Now that the dataframe is loaded, let the mapper see its columns.
        self.reset_mapper(self._mapper)



[docs]
    def reset_mapper(self, new_mapper):
        """Set mapper to a different view of the file.

        Parameters:
            new_mapper (ColumnMapper): A column mapper to be associated with this base input.
        """
        self._mapper = new_mapper
        if not self._mapper:
            self._mapper = ColumnMapper()

        if self._dataframe is not None and self._has_column_names:
            columns = self._dataframe.columns
            self._mapper.set_column_map(columns)


    @property
    def dataframe(self):
        """The underlying dataframe (None until data has been loaded).

        Returns the stored object itself, not a copy.
        """
        return self._dataframe

    @property
    def dataframe_a(self) -> pd.DataFrame:
        """Return the assembled dataframe Probably a placeholder name.

        Returns:
            pd.Dataframe: the assembled dataframe
        """
        return self.assemble()

    @property
    def series_a(self) -> pd.Series:
        """Return the assembled dataframe as a series.

        Returns:
            pd.Series: the assembled dataframe with columns merged.
        """

        return self.combine_dataframe(self.assemble())

    @property
    def series_filtered(self) -> Union[pd.Series, None]:
        """Return the assembled dataframe as a series, with rows that have the same onset combined.

        Returns:
            Union[pd.Series, None]: the assembled dataframe with columns merged, and the rows filtered together.
        """
        if self.onsets is not None:
            return filter_series_by_onset(self.series_a, self.onsets)
        return None

    @property
    def onsets(self):
        """Return the onset column if it exists."""
        if "onset" in self.columns:
            return self._dataframe["onset"]
        return None

    @property
    def needs_sorting(self) -> bool:
        """Return True if this both has an onset column, and it needs sorting."""
        onsets = self.onsets
        if onsets is not None:
            onsets = pd.to_numeric(self.dataframe["onset"], errors="coerce")
            return not onsets.is_monotonic_increasing
        else:
            return False

    @property
    def name(self) -> str:
        """Name of the data, used when reporting errors.

        May be None if no name was supplied and the data was not loaded from a filename.
        """
        return self._name

    @property
    def has_column_names(self) -> bool:
        """True if the dataframe is treated as having column names."""
        return self._has_column_names

    @property
    def loaded_workbook(self):
        """The loaded Excel workbook, or None if the data did not come from an Excel file."""
        return self._loaded_workbook

    @property
    def worksheet_name(self):
        """The worksheet name given at construction (may be None)."""
        return self._worksheet_name


[docs]
    def convert_to_form(self, hed_schema, tag_form):
        """Convert the tags in the mapper's tag columns of the underlying dataframe to the given form.

        Parameters:
            hed_schema (HedSchema): The schema to use to convert tags.
            tag_form (str): HedTag property to convert tags to.
                Most cases should use convert_to_short or convert_to_long below.
        """
        from hed.models.df_util import convert_to_form as _convert_frame

        tag_columns = self._mapper.get_tag_columns()
        _convert_frame(self._dataframe, hed_schema, tag_form, tag_columns)



[docs]
    def convert_to_short(self, hed_schema):
        """Convert all tags in underlying dataframe to short form.

        Parameters:
            hed_schema (HedSchema): The schema to use to convert tags.

        """
        self.convert_to_form(hed_schema, "short_tag")



[docs]
    def convert_to_long(self, hed_schema):
        """Convert all tags in underlying dataframe to long form.

        Parameters:
            hed_schema (HedSchema or None): The schema to use to convert tags.
        """
        self.convert_to_form(hed_schema, "long_tag")



[docs]
    def shrink_defs(self, hed_schema):
        """Shrinks any def-expand found in the underlying dataframe.

        Only the tag columns reported by the column mapper are processed, and the
        underlying dataframe is passed directly to the helper.

        Parameters:
            hed_schema (HedSchema or None): The schema to use to identify defs.
        """
        # Use the full package path, matching the other df_util imports in this module;
        # a bare "df_util" only resolves if hed/models itself is on sys.path.
        from hed.models.df_util import shrink_defs

        shrink_defs(self._dataframe, hed_schema=hed_schema, columns=self._mapper.get_tag_columns())



[docs]
    def expand_defs(self, hed_schema, def_dict):
        """Expands the definitions referenced in the underlying dataframe.

        Only the tag columns reported by the column mapper are processed, and the
        underlying dataframe is passed directly to the helper.

        Parameters:
            hed_schema (HedSchema or None): The schema to use to identify defs.
            def_dict (DefinitionDict): The definitions to expand.
        """
        # Use the full package path, matching the other df_util imports in this module;
        # a bare "df_util" only resolves if hed/models itself is on sys.path.
        from hed.models.df_util import expand_defs

        expand_defs(self._dataframe, hed_schema=hed_schema, def_dict=def_dict, columns=self._mapper.get_tag_columns())



[docs]
    def to_excel(self, file):
        """Output to an Excel file.

        Parameters:
            file (str or file-like): Location to save this base input.

        Raises:
            ValueError: If empty file object was passed.
            OSError: If the file cannot be opened.
        """
        if not file:
            raise ValueError("Empty file name or object passed in to BaseInput.save.")

        dataframe = self._dataframe
        if self._loaded_workbook:
            old_worksheet = self.get_worksheet(self._worksheet_name)
            # Excel spreadsheets are 1 based, then add another 1 for column names if present
            adj_row_for_col_names = 1
            if self._has_column_names:
                adj_row_for_col_names += 1
            adj_for_one_based_cols = 1
            for row_number, text_file_row in dataframe.iterrows():
                for column_number, _column_text in enumerate(text_file_row):
                    cell_value = dataframe.iloc[row_number, column_number]
                    old_worksheet.cell(row_number + adj_row_for_col_names, column_number + adj_for_one_based_cols).value = (
                        cell_value
                    )

            self._loaded_workbook.save(file)
        else:
            dataframe.to_excel(file, header=self._has_column_names)



[docs]
    def to_csv(self, file=None) -> Union[str, None]:
        """Write to file or return as a string.

        Parameters:
            file (str, file-like, or None): Location to save this file. If None, return as string.

        Returns:
            Union[str, None]:  None if file is given or the contents as a str if file is None.

        Raises:
            OSError: If the file cannot be opened.
        """
        dataframe = self._dataframe
        csv_string_if_filename_none = dataframe.to_csv(file, sep="\t", index=False, header=self._has_column_names)
        return csv_string_if_filename_none


    @property
    def columns(self) -> list[str]:
        """Returns a list of the column names.

            Empty if no column names.

        Returns:
            list: The column names.
        """
        columns = []
        if self._dataframe is not None and self._has_column_names:
            columns = list(self._dataframe.columns)
        return columns


[docs]
    def column_metadata(self) -> dict[int, "ColumnMetadata"]:
        """Return the metadata for each column.

        Returns:
            dict[int, ColumnMetadata]: Number/ColumnMetadata pairs.
        """
        if self._mapper:
            return self._mapper._final_column_map
        return {}



[docs]
    def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_tag"):
        """Replace the specified cell with transformed text.

        Parameters:
            row_number (int):    The row number of the spreadsheet to set.
            column_number (int): The column number of the spreadsheet to set.
            new_string_obj (HedString): Object with text to put in the given cell.
            tag_form (str): Version of the tags (short_tag, long_tag, base_tag, etc.)

        Notes:
             Any attribute of a HedTag that returns a string is a valid value of tag_form.

        Raises:
            ValueError: If there is not a loaded dataframe.
            KeyError: If the indicated row/column does not exist.
            AttributeError: If the indicated tag_form is not an attribute of HedTag.
        """
        if self._dataframe is None:
            raise ValueError("No data frame loaded")

        new_text = new_string_obj.get_as_form(tag_form)
        self._dataframe.iloc[row_number, column_number] = new_text



[docs]
    def get_worksheet(self, worksheet_name=None) -> Union[openpyxl.workbook.Workbook, None]:
        """Get the requested worksheet.

        Parameters:
            worksheet_name (str or None): The name of the requested worksheet by name or the first one if None.

        Returns:
            Union[openpyxl.workbook.Workbook, None]: The workbook request.

        Notes:
            If None, returns the first worksheet.

        Raises:
            KeyError: If the specified worksheet name does not exist.
        """
        if worksheet_name and self._loaded_workbook:
            # return self._loaded_workbook.get_sheet_by_name(worksheet_name)
            return self._loaded_workbook[worksheet_name]
        elif self._loaded_workbook:
            return self._loaded_workbook.worksheets[0]
        else:
            return None


    @staticmethod
    def _get_dataframe_from_worksheet(worksheet, has_headers) -> pd.DataFrame:
        """Create a dataframe from the worksheet.

        Parameters:
            worksheet (Worksheet): The loaded worksheet to convert.
            has_headers (bool): True if this worksheet has column headers.

        Returns:
            pd.DataFrame: The converted data frame.

        """
        if has_headers:
            data = worksheet.values
            # first row is columns
            cols = next(data)
            data = list(data)
            return pd.DataFrame(data, columns=cols, dtype=str)
        else:
            return pd.DataFrame(worksheet.values, dtype=str)


[docs]
    def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None) -> list[dict]:
        """Run a SpreadsheetValidator over this file and return all issues found.

        Parameters:
            hed_schema (HedSchema): The schema to use for validation.
            extra_def_dicts (list of DefDict or DefDict): All definitions to use for validation.
            name (str): The name to report errors from this file as. Defaults to this object's name.
            error_handler (ErrorHandler): Error context to use.  Creates a new one if None.

        Returns:
            list[dict]: A list of issues for a HED string.
        """
        from hed.validator.spreadsheet_validator import SpreadsheetValidator

        report_name = name if name else self.name
        validator = SpreadsheetValidator(hed_schema)
        # The definitions come from the mapper, combined with any extra dictionaries supplied.
        def_dict = self._mapper.get_def_dict(hed_schema, extra_def_dicts)
        return validator.validate(self, def_dict, report_name, error_handler=error_handler)


    @staticmethod
    def _dataframe_has_names(dataframe) -> bool:
        for column in dataframe.columns:
            if isinstance(column, str):
                return True
        return False


[docs]
    def assemble(self, mapper=None, skip_curly_braces=False) -> pd.DataFrame:
        """Assembles the HED strings.

        Parameters:
            mapper (ColumnMapper or None): Generally pass none here unless you want special behavior.
            skip_curly_braces (bool): If True, don't plug in curly brace values into columns.

        Returns:
            pd.Dataframe: The assembled dataframe.
        """
        if mapper is None:
            mapper = self._mapper

        all_columns = self._handle_transforms(mapper)
        if skip_curly_braces:
            return all_columns
        transformers, _ = mapper.get_transformers()
        refs = self.get_column_refs()
        column_names = list(transformers)
        return _handle_curly_braces_refs(all_columns, refs, column_names)


    def _handle_transforms(self, mapper) -> pd.DataFrame:
        """Apply transformations to the dataframe using the provided mapper.

        Parameters:
            mapper: The column mapper object containing transformation functions.

        Returns:
            pd.DataFrame: The transformed dataframe with all transformations applied.

        Notes:
            - Handles categorical column conversions before and after transformations
            - Returns original dataframe if no transformers are defined
            - Categorical columns are temporarily converted to 'category' type for processing
              then converted back to 'str' type after transformation
        """
        transformers, need_categorical = mapper.get_transformers()
        if transformers:
            all_columns = self._dataframe
            if need_categorical:
                all_columns[need_categorical] = all_columns[need_categorical].astype("category")

            all_columns = all_columns.transform(transformers)

            if need_categorical:
                all_columns[need_categorical] = all_columns[need_categorical].astype("str")
        else:
            all_columns = self._dataframe

        return all_columns


[docs]
    @staticmethod
    def combine_dataframe(dataframe) -> pd.Series:
        """Combine all columns in the given dataframe into a single HED string series,
            skipping empty columns and columns with empty strings.

        Parameters:
            dataframe (pd.Dataframe): The dataframe to combine

        Returns:
            pd.Series: The assembled series.
        """
        dataframe = dataframe.apply(lambda x: ", ".join(filter(lambda e: bool(e) and e != "n/a", map(str, x))), axis=1)
        return dataframe



[docs]
    def get_def_dict(self, hed_schema, extra_def_dicts=None) -> "DefinitionDict":
        """Return the definition dict for this file.

        Note: Baseclass implementation builds the result from extra_def_dicts only.

        Parameters:
            hed_schema (HedSchema): Identifies tags to find definitions(if needed).
            extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list.

        Returns:
            DefinitionDict:   A single definition dict representing all the data(and extra def dicts).
        """
        from hed.models.definition_dict import DefinitionDict

        combined = DefinitionDict(extra_def_dicts, hed_schema)
        return combined



[docs]
    def get_column_refs(self) -> list:
        """Return a list of column refs for this file.

            Default implementation returns empty list.

        Returns:
           list: A list of unique column refs found.
        """
        return []


    def _open_dataframe_file(self, file, has_column_names, input_type):
        """Load data from various file types into the internal DataFrame.

        This method handles loading data from different file formats including Excel files,
        text files (TSV/CSV), and existing pandas DataFrames. It sets the _dataframe property
        and handles appropriate type conversions and error handling for each file type.

        Parameters:
            file (str, file-like, or pd.DataFrame): The input data source.
                - str: File path to load from
                - file-like: File object to read from
                - pd.DataFrame: Existing DataFrame to use directly
            has_column_names (bool): Whether the file contains column headers.
                Used to determine pandas header parameter for text files.
            input_type (str): File extension indicating the file type.
                Supported types: '.xlsx' (Excel), '.tsv', '.txt' (tab-separated text).

        Raises:
            HedFileError:
                - If file is empty or None (FILE_NOT_FOUND)
                - If unsupported file extension provided (INVALID_EXTENSION)
                - If file loading fails due to format issues (INVALID_FILE_FORMAT)

        Notes:
            - For DataFrame input: Converts to string type and auto-detects column names
            - For Excel files: Loads workbook and converts specified worksheet to DataFrame
            - For text files: Uses pandas read_csv with tab delimiter and handles empty files
            - All loaded data is converted to string type for consistency
            - NaN values in text files are replaced with "n/a"
        """
        pandas_header = 0 if has_column_names else None

        # If file is already a DataFrame
        if isinstance(file, pd.DataFrame):
            self._dataframe = file.astype(str)
            self._has_column_names = self._dataframe_has_names(self._dataframe)
            return

        # Check for empty file or None
        if not file:
            raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file specification passed to BaseInput.", file)

        # Handle Excel file input
        if input_type in self.EXCEL_EXTENSION:
            self._load_excel_file(file, has_column_names)
            return

        # Handle unsupported file extensions
        if input_type not in self.TEXT_EXTENSION:
            raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unsupported file extension for text files.", self.name)

        # Handle text file input (CSV/TSV)
        self._load_text_file(file, pandas_header)

    def _load_excel_file(self, file, has_column_names):
        """Load a worksheet of an Excel file into the internal dataframe.

        The workbook is opened with openpyxl and kept in self._loaded_workbook.
        The worksheet named by self._worksheet_name (or the first worksheet if
        that is None) is then converted to a dataframe.

        Parameters:
            file (str or file-like): Path to the Excel file or file-like object to load.
            has_column_names (bool): Whether the first row of the worksheet contains
                column headers that should be used as DataFrame column names.

        Raises:
            HedFileError: If anything fails while opening the workbook, getting the
                worksheet, or converting it. The original exception is chained.

        Notes:
            - The worksheet is converted with _get_dataframe_from_worksheet, which
              builds the dataframe with str dtype.
        """
        try:
            # Stored first: get_worksheet reads the workbook from self._loaded_workbook.
            self._loaded_workbook = openpyxl.load_workbook(file)
            sheet = self.get_worksheet(self._worksheet_name)
            frame = self._get_dataframe_from_worksheet(sheet, has_column_names)
            self._dataframe = frame
        except Exception as err:
            message = f"Failed to load Excel file: {str(err)}"
            raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, message, self.name) from err

    def _load_text_file(self, file, pandas_header):
        """Load a text file (TSV/CSV) into a pandas DataFrame.

        This method handles loading tab-separated value files and other text-based
        formats using pandas read_csv. It includes special handling for empty files,
        proper NaN value replacement, and comprehensive error handling.

        Parameters:
            file (str or file-like): Path to the text file or file-like object to load.
                Can be any format supported by pandas read_csv with tab delimiter.
            pandas_header (int or None): Row number to use as column headers.
                - 0: First row contains headers
                - None: No header row, generate default column names

        Raises:
            HedFileError: If loading fails due to file format issues, encoding problems,
                or any other pandas-related errors. The original exception is chained
                for debugging purposes.

        Notes:
            - Uses tab delimiter for parsing (appropriate for .tsv files)
            - Handles empty files by creating an empty DataFrame
            - Converts all data to string type for consistency
            - Replaces NaN values with "n/a" for consistent handling
            - Skips blank lines during parsing
            - Uses specific na_values configuration ("", "null")
            - Handles pandas.errors.EmptyDataError for files with no data
        """
        if isinstance(file, str) and os.path.exists(file) and os.path.getsize(file) == 0:
            self._dataframe = pd.DataFrame()  # Handle empty file
            return

        try:
            self._dataframe = pd.read_csv(
                file,
                delimiter="\t",
                header=pandas_header,
                skip_blank_lines=True,
                dtype=str,
                keep_default_na=True,
                na_values=("", "null"),
            )
            # Replace NaN values with a known value
            self._dataframe = self._dataframe.fillna("n/a")
        except pd.errors.EmptyDataError:
            self._dataframe = pd.DataFrame()  # Handle case where file has no data
        except Exception as e:
            raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, f"Failed to load text file: {str(e)}", self.name) from e