# Source code for hed.tools.analysis.column_name_summary
"""Summarize the unique column names in a dataset."""
import json
[docs]
class ColumnNameSummary:
    """Summarize the unique column names in a dataset.

    Every distinct ordered list of column names is stored once in
    ``unique_headers``; ``file_dict`` maps each file name to the index of
    its column-name list in ``unique_headers``.
    """

    def __init__(self, name=""):
        """Create an empty summary.

        Parameters:
            name (str): Name reported under "Summary name" by get_summary.
        """
        self.name = name
        self.file_dict = {}  # file name -> index into self.unique_headers
        self.unique_headers = []  # distinct column-name lists, in first-seen order

    def update(self, name, columns):
        """Update the summary based on columns associated with a file.

        Parameters:
            name (str): File name associated with the columns.
            columns (list): List of column names.

        Raises:
            ValueError: If name was already recorded with different column names.
                The summary is left unchanged in that case.
        """
        # Work on a private copy so the comparison and the stored pattern do
        # not depend on what the caller does with its list afterwards.
        columns = list(columns)
        if name in self.file_dict:
            previous = self.unique_headers[self.file_dict[name]]
            # Check for a conflict BEFORE registering the pattern; otherwise a
            # rejected update would leave an orphan entry in unique_headers.
            if list(previous) != columns:
                raise ValueError(
                    "FileHasChangedColumnNames",
                    f"{name}: Summary has conflicting column names "
                    f"Current: {str(columns)} Previous: {str(previous)}",
                )
            return
        self.file_dict[name] = self.update_headers(columns)

    def update_headers(self, column_names):
        """Update the unique combinations of column names.

        Parameters:
            column_names (list): List of column names to update.

        Returns:
            int: Index in unique_headers of the (possibly newly added) column-name list.
        """
        # Store a copy rather than the caller's object: a later mutation of
        # the caller's list must not alter an already recorded pattern.
        column_names = list(column_names)
        for index, item in enumerate(self.unique_headers):
            if item == column_names:
                return index
        self.unique_headers.append(column_names)
        return len(self.unique_headers) - 1

    def get_summary(self, as_json=False):
        """Return summary as an object or in JSON.

        Parameters:
            as_json (bool): If False (the default), return the summary as a dictionary,
                otherwise return it as a JSON string.

        Returns:
            Union[dict, str]: Dictionary with keys "Summary name", "Columns" and
                "Number files", or its JSON serialization (indent of 4).
        """
        # Group the file names by the index of their column-name pattern.
        patterns = [[] for _ in self.unique_headers]
        for file_name, position in self.file_dict.items():
            patterns[position].append(file_name)
        # Copy each header list so edits to the returned summary cannot
        # change the internal state.
        column_headers = [
            {"Column names": list(headers), "Files": files}
            for headers, files in zip(self.unique_headers, patterns)
        ]
        summary = {"Summary name": self.name, "Columns": column_headers, "Number files": len(self.file_dict)}
        if as_json:
            return json.dumps(summary, indent=4)
        return summary