# json_hdf5.py
# JSON <-> HDF5 converter (dict/list/str/int/float/bool/null)
# Requires: h5py, numpy
import json
import h5py
import numpy as np
# ------------------------ Name encoding (reversible) ------------------------
_SAFE_CHARS = set(b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-")
def _percent_encode_name(name):
"""
Encode an arbitrary JSON key to a valid HDF5 path component using percent-utf8.
Reversible with _percent_decode_name.
"""
if not isinstance(name, str):
name = str(name)
b = name.encode("utf-8", errors="strict")
out = []
for ch in b:
if ch in _SAFE_CHARS:
out.append(chr(ch))
else:
out.append("%%%02X" % ch)
return "".join(out)
def _percent_decode_name(encoded):
"""
Reverse of _percent_encode_name.
"""
out = bytearray()
i = 0
s = encoded
while i < len(s):
if s[i] == "%" and i + 2 < len(s):
out.append(int(s[i+1:i+3], 16))
i += 3
else:
out.append(ord(s[i]))
i += 1
return out.decode("utf-8")
# ------------------------ Writers ------------------------
def _write_json_node(h, name, obj):
"""
Write a JSON value under group h with child name.
Returns the created object (Group or Dataset).
"""
if isinstance(obj, dict):
g = h.create_group(name)
g.attrs["kind"] = np.string_("dict")
# Preserve key order (Python 3.7+ dicts are ordered)
for k, v in obj.items():
child = _percent_encode_name(k)
_write_json_node(g, child, v)
return g
if isinstance(obj, list):
g = h.create_group(name)
g.attrs["kind"] = np.string_("list")
g.attrs["length"] = np.int64(len(obj))
for i, v in enumerate(obj):
_write_json_node(g, str(i), v)
return g
# Scalars
if obj is None:
g = h.create_group(name)
g.attrs["kind"] = np.string_("none")
return g
if isinstance(obj, bool):
d = h.create_dataset(name, data=np.bool_(obj))
d.attrs["kind"] = np.string_("bool")
return d
if isinstance(obj, int) and not isinstance(obj, bool):
d = h.create_dataset(name, data=np.int64(obj))
d.attrs["kind"] = np.string_("int")
return d
if isinstance(obj, float):
d = h.create_dataset(name, data=np.float64(obj))
d.attrs["kind"] = np.string_("float")
return d
if isinstance(obj, str):
dt = h5py.string_dtype(encoding="utf-8")
d = h.create_dataset(name, data=np.array(obj, dtype=dt))
d.attrs["kind"] = np.string_("str")
return d
# Fallback: store as JSON string
dt = h5py.string_dtype(encoding="utf-8")
d = h.create_dataset(name, data=np.array(json.dumps(obj), dtype=dt))
d.attrs["kind"] = np.string_("json_blob")
return d
[docs]
def json_to_hdf5(json_obj, h5_path, root="/"):
"""
Write a JSON-serializable object to HDF5 file at h5_path.
"""
with h5py.File(h5_path, "w") as f:
f.attrs["format"] = np.string_("json-hdf5")
f.attrs["name_encoding"] = np.string_("percent-utf8")
_write_json_node(f, root.strip("/"), json_obj)
[docs]
def json_file_to_hdf5(json_path, h5_path, root="/"):
with open(json_path, "r", encoding="utf-8") as fp:
obj = json.load(fp)
json_to_hdf5(obj, h5_path, root=root)
# ------------------------ Readers ------------------------
def _read_json_node(hobj):
"""
Read a JSON value from an HDF5 object (Group or Dataset).
"""
kind = None
if "kind" in hobj.attrs:
v = hobj.attrs["kind"]
if isinstance(v, bytes):
kind = v.decode("utf-8")
elif isinstance(v, np.ndarray) and v.dtype.kind == "S":
kind = v.astype(str)
else:
kind = str(v)
else:
# Heuristic: datasets without kind -> try best-effort
if isinstance(hobj, h5py.Dataset):
data = hobj[()]
if isinstance(data, (bytes, np.bytes_)):
try:
return data.decode("utf-8")
except UnicodeDecodeError:
return data.decode("utf-8", errors="replace")
if isinstance(data, np.ndarray) and data.shape == ():
return data.item()
return data
# Groups without kind: assume dict
kind = "dict"
if isinstance(hobj, h5py.Dataset):
if kind == "str":
val = hobj.asstr()[()]
return val
if kind == "bool":
return bool(hobj[()].item())
if kind == "int":
return int(hobj[()].item())
if kind == "float":
return float(hobj[()].item())
if kind == "json_blob":
s = hobj.asstr()[()]
return json.loads(s)
# Fallback
data = hobj[()]
if isinstance(data, np.ndarray) and data.shape == ():
return data.item()
return data
# Group kinds
if kind == "none":
return None
if kind == "list":
# Reconstruct in order 0..length-1 if available, else numeric sort
length = int(hobj.attrs["length"]) if "length" in hobj.attrs else None
if length is not None:
out = []
for i in range(length):
child = hobj[str(i)]
out.append(_read_json_node(child))
return out
# Fallback: numeric sort of keys
items = []
for k in hobj.keys():
try:
idx = int(k)
except ValueError:
idx = None
items.append((idx, k))
items.sort(key=lambda t: (t[0] is None, t[0], t[1]))
return [_read_json_node(hobj[k]) for _, k in items]
if kind == "dict":
d = {}
for k in hobj.keys():
decoded = _percent_decode_name(k)
d[decoded] = _read_json_node(hobj[k])
return d
# Unknown kind: attempt best-effort
return {k: _read_json_node(hobj[k]) for k in hobj.keys()}
[docs]
def hdf5_to_json(h5_path, root="/"):
"""
Read HDF5 file and return a JSON-serializable Python object from the given root node.
"""
with h5py.File(h5_path, "r") as f:
node_name = root.strip("/")
if node_name in f:
return _read_json_node(f[node_name])
# If root is top-level and not present, accept the only top-level object
keys = list(f.keys())
if len(keys) == 1:
return _read_json_node(f[keys[0]])
raise KeyError("Root node not found. Available: %r" % keys)
[docs]
def hdf5_file_to_json(h5_path, json_path, root="/", pretty=True):
obj = hdf5_to_json(h5_path, root=root)
with open(json_path, "w", encoding="utf-8") as fp:
if pretty:
json.dump(obj, fp, ensure_ascii=False, indent=2, sort_keys=False)
else:
json.dump(obj, fp, ensure_ascii=False, separators=(",", ":"))