"""Data handling utilities involving dataframes."""
import pandas as pd
import numpy as np
from hed.errors.exceptions import HedFileError
[docs]
def add_columns(df, column_list, value="n/a"):
    """Add specified columns to df if not there.

    Parameters:
        df (DataFrame): Pandas dataframe (modified in place).
        column_list (list): List of columns to append to the dataframe.
        value (str): Default fill value for the column.

    Notes:
        - Columns already present in df are left untouched.
        - Missing columns are appended in the order they appear in column_list,
          so the resulting column order is deterministic.
    """
    for col in column_list:
        # Membership is rechecked on every pass, so a name repeated in column_list is only added once.
        if col not in df.columns:
            df[col] = value
[docs]
def check_match(ds1, ds2, numeric=False):
    """Check two Pandas data series have the same values.

    Parameters:
        ds1 (DataSeries): Pandas data series to check.
        ds2 (DataSeries): Pandas data series to check.
        numeric (bool): If True, treat as numeric and do close-to comparison.

    Returns:
        str or list: A message describing the mismatch, or an empty list if the series match.

    Notes:
        - The comparison is positional in both modes; the index labels of ds2 are ignored.
        - Reported positions are the index labels of ds1 at the mismatched entries.
        - In numeric mode, entries that cannot be converted to numbers become NaN and NaN matches NaN.
        - In non-numeric mode, entries are converted to strings before comparison.
    """
    if len(ds1.index) != len(ds2.index):
        return f"First series has length {len(ds1.index)} and {len(ds2.index)} events"
    if numeric:
        mismatch = np.logical_not(
            np.isclose(pd.to_numeric(ds1, errors="coerce"), pd.to_numeric(ds2, errors="coerce"), equal_nan=True)
        )
    else:
        # Compare the underlying arrays so that differently-labeled series are
        # compared entry by entry rather than raising on index alignment.
        mismatch = ds1.map(str).to_numpy() != ds2.map(str).to_numpy()
    if mismatch.any():
        return f"Series differ at positions {list(ds1.loc[mismatch].index)}"
    return []
[docs]
def delete_columns(df, column_list):
    """Remove the listed columns from a dataframe, skipping names it does not have.

    Parameters:
        df (DataFrame): Pandas dataframe from which to delete columns.
        column_list (list): List of candidate column names for deletion.

    Notes:
        - The dataframe is modified in place.
        - Names in column_list that are not columns of df are silently ignored.
    """
    removable = set(column_list) & set(df)
    df.drop(columns=list(removable), inplace=True)
[docs]
def delete_rows_by_column(df, value, column_list=None):
    """Drop every row in which one of the searched columns holds the given value.

    Parameters:
        df (DataFrame): Pandas dataframe from which to delete rows.
        value (str): Specified value to indicate row should be deleted.
        column_list (list): List of columns to search for value. If empty or None, all columns are searched.

    Notes:
        - Both the cell contents and value are converted to strings before they are compared.
        - Rows are removed in place.
        - Names in column_list that are not columns of df are ignored.
    """
    target = str(value)
    search_cols = list(set(column_list).intersection(set(df))) if column_list else list(df)
    for name in search_cols:
        matches = df[name].map(str) == target
        df.drop(df[matches].index, axis=0, inplace=True)
[docs]
def get_eligible_values(values, values_included):
    """Return a list of the items from values that are in values_included or None if no values_included.

    Parameters:
        values (list): List of strings against which to test.
        values_included (list): List of items to be selected from values if they are present.

    Returns:
        list: list of selected values (in values_included order) or None if values_included is empty or None.
    """
    if not values_included:
        return None
    # Build the lookup set once rather than once per candidate.
    available = frozenset(values)
    return [x for x in values_included if x in available]
[docs]
def get_key_hash(key_tuple):
    """Hash an ordered collection of key values after converting each to a string.

    Parameters:
        key_tuple (tuple, list): The key values in the correct order for lookup.

    Returns:
        int: Hash of the tuple formed from the string form of each key value.
    """
    return hash(tuple(map(str, key_tuple)))
[docs]
def get_new_dataframe(data):
    """Produce a fresh dataframe from a tsv filename or an existing dataframe.

    Parameters:
        data (DataFrame or str): DataFrame or filename representing a tsv file.

    Returns:
        DataFrame: The contents of the tab-separated file read with its first line as the header,
            or a copy of data if data was already a DataFrame.

    :raises HedFileError:
        - If data is neither a str nor a DataFrame.

    Notes:
        - Exceptions raised by pandas while reading the file are not caught here.
        - The strings "," and "null" are treated as missing values in addition to the pandas defaults.
    """
    if isinstance(data, pd.DataFrame):
        return data.copy()
    if isinstance(data, str):
        return pd.read_csv(data, delimiter="\t", header=0, keep_default_na=True, na_values=[",", "null"])
    raise HedFileError("BadDataFrame", "get_new_dataframe could not extract DataFrame from data", "")
[docs]
def get_row_hash(row, key_list):
    """Compute the lookup hash of a row from its entries in the key columns.

    Parameters:
        row (DataSeries): A Pandas data series corresponding to a row in a spreadsheet.
        key_list (list): List of column names to create the hash value from.

    Returns:
        int: Hash produced by get_key_hash from the key_list entries of row, after missing
            entries are replaced by "n/a" and all entries are converted to strings.

    :raises HedFileError:
        - If row doesn't have all the columns in key_list.
    """
    _, absent_keys = separate_values(list(row.index.values), key_list)
    if absent_keys:
        raise HedFileError("lookup_row", f"row must have all keys, missing{str(absent_keys)}", "")
    key_values = row[key_list].fillna("n/a").astype(str)
    return get_key_hash(key_values)
[docs]
def get_value_dict(tsv_path, key_col="file_basename", value_col="sampling_rate"):
    """Build a dictionary that maps one column of a tsv file to another.

    Parameters:
        tsv_path (str): Path to a tsv file with a header row to be read into a DataFrame.
        key_col (str): Name of the column which should be the key.
        value_col (str): Name of the column which should be the value.

    Returns:
        dict: Dictionary with key_col values as the keys and the corresponding value_col values as the values.

    Raises:
        HedFileError: If a key_col value appears in more than one row, or if get_new_dataframe
            rejects tsv_path.
    """
    frame = get_new_dataframe(tsv_path)
    lookup = {}
    for _, record in frame.iterrows():
        key = record[key_col]
        if key in lookup:
            raise HedFileError("DuplicateKeyInValueDict", "The key column must have unique values", "")
        lookup[key] = record[value_col]
    return lookup
[docs]
def make_info_dataframe(col_info, selected_col):
    """Build a dataframe listing the sorted values recorded for one column.

    Parameters:
        col_info (dict): Dictionary of dictionaries of column values and counts.
        selected_col (str): Name of the column used as top level key for col_info.

    Returns:
        DataFrame: A single-column dataframe, with column name selected_col, holding the sorted keys of
            the dictionary stored under selected_col. The counts are not included.
            The returned value is None if selected_col is not a top-level key in col_info or
            its dictionary is empty.
    """
    col_dict = col_info.get(selected_col)
    if col_dict:
        return pd.DataFrame(sorted(col_dict.keys()), columns=[selected_col])
    return None
[docs]
def replace_na(df):
    """Convert "n/a" entries to np.nan in place, with special handling for categorical columns.

    Parameters:
        df (DataFrame): Dataframe whose "n/a" entries are replaced.

    Notes:
        - Categorical columns without an "n/a" category are left unchanged.
        - A categorical column with an "n/a" category is converted to object, has its "n/a"
          entries replaced, and is then rebuilt as a categorical.
    """
    for name in df.columns:
        series = df[name]
        if series.dtype.name == "category":
            if "n/a" not in series.cat.categories:
                continue
            as_object = series.astype("object").replace("n/a", np.nan)
            df[name] = pd.Categorical(as_object)
        else:
            df[name] = series.replace("n/a", np.nan)
[docs]
def replace_values(df, values=None, replace_value="n/a", column_list=None):
    """Replace string values in specified columns.

    Parameters:
        df (DataFrame): Dataframe whose values will be replaced (modified in place).
        values (list, None): List of strings to replace. If None, only empty strings are replaced.
        replace_value (str): String replacement value.
        column_list (list, None): List of columns in which to do replacement. If None all columns are processed.

    Returns:
        int: number of values replaced.

    Notes:
        - Cell contents and the entries of values are compared after conversion to strings.
        - Names in column_list that are not columns of df are ignored.
        - Only the matching cells are changed, even when df has repeated index labels.
    """
    if column_list:
        cols = list(set(column_list).intersection(set(df)))
    else:
        cols = list(df)
    if not values:
        values = [""]
    num_replaced = 0
    for col in cols:
        for value in values:
            # A positional boolean mask targets exactly the matching rows; selecting by
            # index label would also overwrite other rows sharing a matching row's label.
            value_mask = (df[col].map(str) == str(value)).to_numpy()
            count = int(value_mask.sum())
            if count:
                df.loc[value_mask, col] = replace_value
                num_replaced += count
    return num_replaced
[docs]
def reorder_columns(data, col_order, skip_missing=True):
    """Return a new dataframe restricted to the requested columns in the requested order.

    Parameters:
        data (DataFrame, str): Dataframe or filename of dataframe whose columns are to be reordered.
        col_order (list): List of column names in desired order.
        skip_missing (bool): If true, col_order columns missing from data are skipped, otherwise error.

    Returns:
        DataFrame: A new dataframe holding the col_order columns that data has, in col_order order.

    Raises:
        HedFileError: If col_order contains columns not in data and skip_missing is False,
            or if get_new_dataframe rejects data.
    """
    frame = get_new_dataframe(data)
    existing = frame.columns.values.tolist()
    kept, absent = separate_values(existing, col_order)
    if absent and not skip_missing:
        raise HedFileError("MissingKeys", f"Events file must have columns {str(absent)}", "")
    return frame[kept]
[docs]
def separate_values(values, target_values):
    """Split target_values into those present in values and those missing from values.

    Parameters:
        values (list): List of values to be tested.
        target_values (list): List of desired values.

    Returns:
        tuple[list, list]: A tuple containing two lists:
            - Target values present in values (in target_values order).
            - Target values missing from values (in target_values order, without repeats).

    Notes:
        - If target_values is empty or None, both lists are empty.
        - If values is empty or None, every target value is reported as missing.
    """
    if not target_values:
        return [], []
    if not values:
        # Return a copy so callers cannot modify target_values through the result.
        return [], list(target_values)
    # Build the lookup set once rather than once per target value.
    available = frozenset(values)
    present_values = [x for x in target_values if x in available]
    # dict.fromkeys drops repeats while keeping first-seen order, so the result is deterministic.
    missing_values = list(dict.fromkeys(x for x in target_values if x not in available))
    return present_values, missing_values