"""Common utility functions for analysis."""
import json
import logging
from pathlib import Path
from typing import Any, Mapping

import h5py
import numpy as np

logger = logging.getLogger(__name__)


class NumpyEncoder(json.JSONEncoder):
    """
    Custom JSON encoder for serializing NumPy ndarray and scalar objects.

    This encoder converts NumPy arrays to native Python lists and NumPy scalar
    types (e.g., float32, int64) to their native Python equivalents so they
    can be serialized by the standard `json` module. It can be used with
    `json.dump` or `json.dumps` by passing it as the `cls` argument.

    Example:
        json.dump(data, file, cls=NumpyEncoder)
    """

    def default(self, obj):
        """
        Convert NumPy arrays and scalar types to JSON-serializable forms.

        Parameters:
            obj (Any): The object to be serialized.

        Returns:
            A JSON-serializable version of the object. If the object is a
            NumPy ndarray, it is converted to a list. If the object is a NumPy
            scalar (e.g., np.float32, np.int64), it is converted to the
            equivalent Python scalar. Callables are replaced by a short
            placeholder string. Otherwise, the superclass's default method is
            used.

        Raises:
            TypeError: If the object cannot be serialized by the superclass.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # The abstract bases np.floating and np.integer already cover the
        # sized variants (np.float32, np.int64, etc.).
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if callable(obj):
            # Not every callable has __name__ (e.g., functools.partial).
            return f"<function {getattr(obj, '__name__', 'anonymous')}>"
        return super().default(obj)
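
# A minimal usage sketch for NumpyEncoder (the values below are hypothetical):
#
#     payload = {"weights": np.arange(3, dtype=np.float32), "count": np.int64(2)}
#     json.dumps(payload, cls=NumpyEncoder)
#     # -> '{"weights": [0.0, 1.0, 2.0], "count": 2}'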


def safe_json_dumps(obj):
    """Serialize an object to JSON safely, falling back to str() on failure."""
    try:
        return json.dumps(obj, cls=NumpyEncoder)
    except TypeError as e:
        # Fallback for anything NumpyEncoder still cannot handle, such as
        # sets or arbitrary custom objects.
        logger.warning("Falling back to str() for object %r: %s", obj, e)
        return str(obj)
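
# Example of the fallback path (hypothetical inputs): a set is not
# JSON-serializable even with NumpyEncoder, so str() is used instead.
#
#     safe_json_dumps({"a": np.float64(1.5)})  # -> '{"a": 1.5}'
#     safe_json_dumps({1, 2, 3})               # -> '{1, 2, 3}'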


def export_to_hdf5(
    record: Mapping[str, Any], out_path: Path, dataset_name: str = "analysis_record"
) -> None:
    """
    Save a nested dict/list/array structure to HDF5 with full breakdown.

    - Dicts become groups.
    - Lists of primitives become datasets.
    - Lists of complex objects become subgroups item_0, item_1, etc.
    - NumPy arrays become datasets.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with h5py.File(out_path, "w") as h5file:

        def recurse(group, obj):
            if isinstance(obj, dict):
                for key, value in obj.items():
                    recurse(group.create_group(str(key)), value)
            elif isinstance(obj, list):
                if len(obj) == 0:
                    # Marker dataset so empty lists survive the round trip.
                    group.create_dataset("empty", data=[])
                elif all(isinstance(item, (int, float, np.number)) for item in obj):
                    group.create_dataset("values", data=np.array(obj, dtype=float))
                elif all(isinstance(item, (str, bytes)) for item in obj):
                    dt = h5py.string_dtype(encoding="utf-8")
                    group.create_dataset("values", data=np.array(obj, dtype=dt))
                else:
                    # Mixed or nested lists become numbered subgroups.
                    for idx, item in enumerate(obj):
                        recurse(group.create_group(f"item_{idx}"), item)
            elif isinstance(obj, np.ndarray):
                try:
                    group.create_dataset("values", data=obj)
                except TypeError:
                    # e.g. object-dtype arrays: fall back to UTF-8 strings.
                    dt = h5py.string_dtype(encoding="utf-8")
                    group.create_dataset("values", data=obj.astype(str), dtype=dt)
            else:
                # Scalars (numbers, strings, bools) are stored as attributes.
                try:
                    group.attrs["value"] = obj
                except TypeError:
                    group.attrs["value"] = str(obj)

        recurse(h5file.create_group(dataset_name), record)
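
# Round-trip sketch (hypothetical record; the path is illustrative only):
#
#     record = {"metrics": {"loss": [0.5, 0.25]}, "tags": ["run1", "run2"]}
#     export_to_hdf5(record, Path("out/analysis.h5"))
#     # Produces groups /analysis_record/metrics/loss and /analysis_record/tags,
#     # each holding a "values" dataset.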


def load_analysis_from_hdf5(
    file_path: str | Path, dataset_name: str = "analysis_record"
) -> dict:
    """
    Load a nested dict/list/NumPy array structure from an HDF5 file.

    This reverses `export_to_hdf5` as closely as possible. Integer-valued
    NumPy floats are converted back to Python ints recursively (e.g., so that
    they can be used as list indices).

    Parameters
    ----------
    file_path : str or Path
        Path to the HDF5 file containing the saved analysis dictionary.
    dataset_name : str
        Name of the top-level group in the HDF5 file where the dictionary is
        stored. Default is 'analysis_record'.

    Returns
    -------
    record : dict
        Nested dictionary reconstructed from the HDF5 file. The structure
        preserves:

        - dicts as dicts
        - lists of complex objects as lists (lists of primitives come back
          as NumPy arrays)
        - NumPy arrays as np.ndarray
        - strings as str
        - empty lists as []
        - primitive values stored in attributes
        - integer-valued floats converted to Python int recursively

    Raises
    ------
    KeyError
        If `dataset_name` is not found in the HDF5 file.
    """

    def recurse(group):
        """Recursively reconstruct dict/list/array from an HDF5 group."""
        if "values" in group:
            data = group["values"][()]
            # Convert bytes to str if needed; h5py returns variable-length
            # UTF-8 strings as bytes objects in an object-dtype array.
            if isinstance(data, np.ndarray) and data.dtype.kind in ("S", "O"):
                return np.array(
                    [x.decode("utf-8") if isinstance(x, bytes) else x for x in data]
                )
            # Scalar array
            if isinstance(data, np.ndarray) and data.shape == ():
                val = data.item()
                return (
                    int(val)
                    if isinstance(val, (np.integer, np.floating))
                    and float(val).is_integer()
                    else val
                )
            # Full array: convert integer-valued floats to int
            if isinstance(data, np.ndarray) and np.issubdtype(
                data.dtype, np.floating
            ):
                if np.all(np.mod(data, 1) == 0):
                    data = data.astype(int)
            return data
        if "empty" in group:
            return []
        if "value" in group.attrs:
            val = group.attrs["value"]
            return (
                int(val)
                if isinstance(val, np.floating) and float(val).is_integer()
                else val
            )
        keys = list(group.keys())
        # Numbered subgroups were written from a list; restore their order.
        # The `keys` guard keeps an empty group decoding as {} rather than [].
        if keys and all(k.startswith("item_") for k in keys):
            return [
                recurse(group[k])
                for k in sorted(keys, key=lambda x: int(x.split("_")[1]))
            ]
        return {k: recurse(group[k]) for k in keys}

    with h5py.File(file_path, "r") as h5file:
        if dataset_name not in h5file:
            raise KeyError(f"Dataset '{dataset_name}' not found in HDF5 file.")
        return recurse(h5file[dataset_name])
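
# Loading the file written above (hypothetical path) reconstructs the nested
# structure; note that lists of numbers come back as NumPy arrays:
#
#     loaded = load_analysis_from_hdf5("out/analysis.h5")
#     loaded["metrics"]["loss"]  # -> array([0.5, 0.25])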


def sanitize_analysis_for_logging(obj, path="root", _depth=0, _max_depth=6):
    """Return a JSON-safe version of any object, logging any functions it finds."""
    if callable(obj):
        logger.debug(
            f"Removing function at {path}: {getattr(obj, '__name__', 'anonymous')}"
        )
        return f"<function {getattr(obj, '__name__', 'anonymous')}>"
    if _depth > _max_depth:
        return f"<... truncated depth {_depth} ...>"
    if isinstance(obj, dict):
        return {
            k: sanitize_analysis_for_logging(
                v, path=f"{path}.{k}", _depth=_depth + 1, _max_depth=_max_depth
            )
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple, set)):
        return [
            sanitize_analysis_for_logging(
                v, path=f"{path}[{i}]", _depth=_depth + 1, _max_depth=_max_depth
            )
            for i, v in enumerate(obj)
        ]
    if isinstance(obj, np.ndarray):
        # Summarize arrays instead of dumping their contents into the log.
        return {
            "_array_shape": obj.shape,
            "_array_dtype": str(obj.dtype),
            "_summary": {
                "min": float(np.min(obj)) if obj.size else None,
                "max": float(np.max(obj)) if obj.size else None,
                "mean": float(np.mean(obj)) if obj.size else None,
            },
        }
    if isinstance(obj, bytes):
        return obj.decode("utf-8", errors="replace")
    if isinstance(obj, np.generic):
        # np.generic covers all NumPy scalars, including np.bool_.
        return obj.item()
    return obj
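
# Sketch of the sanitization (hypothetical record): callables become
# placeholder strings and arrays become shape/dtype/summary dicts.
#
#     sanitize_analysis_for_logging({"fit": np.poly1d, "x": np.ones(4)})
#     # -> {"fit": "<function poly1d>",
#     #     "x": {"_array_shape": (4,), "_array_dtype": "float64",
#     #           "_summary": {"min": 1.0, "max": 1.0, "mean": 1.0}}}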


def make_json_safe(record: dict) -> dict:
    """
    Make an AnalysisRecord safe for export through JSON dumping.

    Prepares an AnalysisRecord for JSON dumping by stripping out or
    summarizing any non-JSON-serializable entries.

    Parameters
    ----------
    record : dict
        The full AnalysisRecord returned by AnalysisPipeline.run().

    Returns
    -------
    dict
        A new dict with the same top-level keys ("environment", "analysis",
        "provenance"), but with each value run through
        `sanitize_analysis_for_logging` so that it contains only numbers,
        strings, lists, and dicts.
    """
    return {
        "environment": sanitize_analysis_for_logging(record["environment"]),
        "analysis": sanitize_analysis_for_logging(record["analysis"]),
        "provenance": sanitize_analysis_for_logging(record["provenance"]),
    }
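
# Typical flow (AnalysisPipeline is referenced by the docstring above and is
# assumed to live elsewhere in the package; `pipeline` is hypothetical):
#
#     record = pipeline.run()
#     with open("record.json", "w") as fh:
#         json.dump(make_json_safe(record), fh, cls=NumpyEncoder)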