Source code for hedweb.columns
"""
Extracts column information from a file and provides utilities for working with columns in a web form.
"""
import os
import openpyxl
from hed.errors import HedFileError
from hed.tools.analysis.tabular_summary import TabularSummary
from pandas import DataFrame, read_csv
from hedweb.constants import base_constants as bc
from hedweb.constants import file_constants as fc
from hedweb.web_util import form_has_file, form_has_option
[docs]
def create_column_selections(form_dict) -> tuple[list[str], list[str]]:
"""Return a tag prefix dictionary from a form dictionary.
Parameters:
form_dict (dict): The column prefix table returned from a form.
Returns:
tuple[list[str], list[str]]: list of value columns and a list of skipped columns.
"""
columns = []
columns_selected = []
columns_categorical = []
keys = form_dict.keys()
for key in keys:
if not key.startswith("column_") or not key.endswith("_name"):
continue
pieces = key.split("_")
col_name = form_dict[key]
columns.append(col_name)
if "column_" + pieces[1] + "_use" in keys:
columns_selected.append(col_name)
if "column_" + pieces[1] + "_category" in keys:
columns_categorical.append(col_name)
columns_value = list(set(columns_selected).difference(set(columns_categorical)))
columns_skip = list(set(columns).difference(set(columns_selected)))
return columns_value, columns_skip
[docs]
def get_tag_columns(form_dict) -> list[str]:
"""Return the tag column names selected from a form.
Parameters:
form_dict (dict): The column names table.
Returns:
list[str]: List of tag columns
"""
tag_columns = []
keys = form_dict.keys()
for key in keys:
if not key.startswith("column_") or not key.endswith("_use"):
continue
pieces = key.split("_")
column_name_key = "column_" + pieces[1] + "_name"
if column_name_key in keys and form_dict[column_name_key]:
tag_columns.append(form_dict[column_name_key])
return tag_columns
def _create_columns_info(columns_file, has_column_names=True, sheet_name=None) -> dict:
"""Create a dictionary of column information from a spreadsheet.
Parameters:
columns_file (File-like): File to create the dictionary for.
has_column_names (bool): If True, first row is interpreted as the column names.
sheet_name (str): The name of the worksheet if this is an Excel file.
Returns:
dict: Dictionary containing information include column names and number of unique values in each column.
Raises:
HedFileError: If the file does not have the either an Excel or text file extension.
"""
header = None
if has_column_names:
header = 0
sheet_names = None
filename = columns_file.filename
file_ext = os.path.splitext(filename.lower())[1]
if file_ext in fc.EXCEL_FILE_EXTENSIONS:
worksheet, sheet_names = _get_worksheet(columns_file, sheet_name)
dataframe = dataframe_from_worksheet(worksheet, has_column_names)
sheet_name = worksheet.title
elif file_ext in fc.TEXT_FILE_EXTENSIONS:
dataframe = read_csv(columns_file, delimiter="\t", header=header)
else:
raise HedFileError(
"BadFileExtension",
f"File {filename} extension does not correspond to an Excel or tsv file",
"",
)
col_list = list(dataframe.columns)
col_dict = TabularSummary()
col_dict.update(dataframe)
col_counts = col_dict.get_number_unique()
columns_info = {
bc.COLUMNS_FILE: filename,
bc.COLUMN_LIST: col_list,
bc.COLUMN_COUNTS: col_counts,
bc.WORKSHEET_SELECTED: sheet_name,
bc.WORKSHEET_NAMES: sheet_names,
}
return columns_info
[docs]
def dataframe_from_worksheet(worksheet, has_column_names) -> DataFrame:
"""Return a pandas data frame from an Excel worksheet.
Parameters:
worksheet (Worksheet): A single worksheet of an Excel file.
has_column_names (bool): If True, interpret the first row as column names.
Returns:
DataFrame: The data represented in the worksheet.
"""
if not has_column_names:
data_frame = DataFrame(worksheet.values)
else:
data = worksheet.values
# first row is columns
cols = next(data)
data = list(data)
data_frame = DataFrame(data, columns=cols)
return data_frame
[docs]
def get_columns_request(request) -> dict:
"""Create a columns info dictionary based on the request.
Parameters:
request (Request): The Request object from which to extract the information.
Returns:
dict: The dictionary with the column names.
Raises:
HedFileError: If the file is missing or has a bad extension.
"""
if not form_has_file(request.files, bc.COLUMNS_FILE):
raise HedFileError("MissingFile", "An uploadable file was not provided", None)
columns_file = request.files.get(bc.COLUMNS_FILE, "")
has_column_names = form_has_option(request.form, "has_column_names", "on")
sheet_name = request.form.get(bc.WORKSHEET_SELECTED, None)
return _create_columns_info(columns_file, has_column_names, sheet_name)
[docs]
def get_column_numbers(form_dict) -> list:
"""Return a tag prefix dictionary from a form dictionary.
Parameters:
form_dict (dict): The dictionary returned from a form that contains a column table.
Returns:
list: List of selected columns.
Note: The form counts columns starting from 1.
"""
tag_columns = []
keys = form_dict.keys()
for key in keys:
index_check = key.rfind("_check")
if index_check == -1 or form_dict[key] != "on":
continue
pieces = key.split("_")
column_number = int(pieces[1])
tag_columns.append(column_number)
return tag_columns
def _get_worksheet(
excel_file, sheet_name
) -> tuple[openpyxl.worksheet.worksheet.Worksheet, list[str]]:
"""Return a Worksheet and a list of sheet names from an Excel file.
Parameters:
excel_file (str): Name of the Excel file to use.
sheet_name (str or None): Name of the worksheet if any, otherwise the first one.
Returns:
tuple[Worksheet, list[str]]: The worksheet and a list of sheet names.
Raises:
HedFileError: If the file does not have a valid Excel extension or the sheet name is not found.
"""
wb = openpyxl.load_workbook(excel_file, read_only=True)
sheet_names = wb.sheetnames
if not sheet_names:
raise HedFileError("BadExcelFile", "Excel files must have worksheets", None)
if sheet_name and sheet_name not in sheet_names:
raise HedFileError(
"BadWorksheetName", f"Worksheet {sheet_name} not in Excel file", ""
)
if sheet_name:
worksheet = wb[sheet_name]
else:
worksheet = wb.worksheets[0]
return worksheet, sheet_names