# Source code for remodeler.operations.factor_column_op
"""Append to tabular file columns of factors based on column values."""
import pandas as pd
from remodeler.operations.base_op import BaseOp
# [docs]
class FactorColumnOp(BaseOp):
    """Append to tabular file columns of factors based on column values.

    Required remodeling parameters:
        - **column_name** (*str*): The name of a column in the DataFrame to compute factors from.

    Optional remodeling parameters:
        - **factor_names** (*list*): Names to use as the factor columns.
        - **factor_values** (*list*): Values in the column column_name to create factors for.

    Notes:
        - If no factor_values are provided, factors are computed for each of the unique values in column_name column.
        - If factor_names are provided, then factor_values must also be provided and the two lists be the same size.
        - If factor_values are provided without factor_names, each factor column is named
          ``<column_name>.<value>``.

    """

    NAME = "factor_column"

    # JSON-schema description of the parameters accepted by this operation.
    PARAMS = {
        "type": "object",
        "properties": {
            "column_name": {
                "type": "string",
                "description": "Name of the column for which to create one-hot factors for unique values.",
            },
            "factor_names": {
                "type": "array",
                "description": "Names of the resulting factor columns. If given must be same length as factor_values",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True,
            },
            "factor_values": {
                "type": "array",
                "description": "Specific unique column values to compute factors for (otherwise all unique values).",
                "items": {"type": "string"},
                "minItems": 1,
                "uniqueItems": True,
            },
        },
        "required": ["column_name"],
        "dependentRequired": {"factor_names": ["factor_values"]},
        "additionalProperties": False,
    }

    def __init__(self, parameters):
        """Constructor for the factor column operation.

        Parameters:
            parameters (dict): Parameter values for required and optional parameters.

        """
        super().__init__(parameters)
        self.column_name = parameters["column_name"]
        # Both lists are optional; None means the parameter was not supplied.
        self.factor_values = parameters.get("factor_values", None)
        self.factor_names = parameters.get("factor_names", None)

    def do_op(self, dispatcher, df, name, sidecar=None) -> pd.DataFrame:
        """Create factor columns based on values in a specified column.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be remodeled.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Not needed for this operation.

        Returns:
            DataFrame: A new DataFrame with the factor columns appended.

        Raises:
            KeyError: If column_name is not a column of df.

        """
        factor_values = self.factor_values
        factor_names = self.factor_names

        # A missing (None) or empty list of values means: one factor per unique
        # value in the column, with automatically generated names.
        if factor_values is None or len(factor_values) == 0:
            factor_values = df[self.column_name].unique()
            factor_names = None

        # Values given without names (or no values at all): derive the names.
        if factor_names is None or len(factor_names) == 0:
            factor_names = [self.column_name + "." + str(value) for value in factor_values]

        df_new = df.copy()
        # Compare on string representations so column entries and requested
        # values match regardless of dtype. Computed once from the input frame
        # so that newly added factor columns cannot affect later comparisons.
        column_as_str = df[self.column_name].map(str)
        for index, factor_value in enumerate(factor_values):
            matches = column_as_str == str(factor_value)
            df_new[factor_names[index]] = matches.astype(int)
        return df_new