#!/usr/bin/env python3
# xml_readers.py
#
# Stand-alone XML loaders:
# - read_xml(filename, keep_root=False, decode_special=True)
# - load_xml(filename, keep_root=False, decode_special=True) # alias of read_xml
#
# Behavior
# - Parses ANY XML file into Python dict/list/scalars.
# - Attributes -> stored under "@attrs"; element text -> under "#text".
# - Repeated child tags become Python lists.
# - If decode_special=True, restores data encoded with markers:
# * @attrs.type="ndarray" with dtype + shape + <item> -> numpy.ndarray
# * @attrs.type="datetime" with #text ISO string -> datetime.datetime
# * @attrs.type="astropy_time" with 'scale' + #text -> astropy.time.Time (if available)
# * @attrs.type="set"/"tuple" with <item> children -> set / tuple
#
# Notes
# - Uses numpy (no math, no typing).
# - No CLI / __main__ block; import and call read_xml/load_xml directly.
import numpy as np
from datetime import datetime
try:
from astropy.time import Time # optional
except Exception:
Time = None # gracefully degrade if astropy isn't installed
import xml.etree.ElementTree as ET
def _element_to_struct(element):
"""
Convert an Element into a nested, JSON-like structure:
- attributes under "@attrs"
- text under "#text"
- children grouped by tag; repeated tags become lists
- pure-text nodes become just the text scalar
"""
node = {}
if element.attrib:
node["@attrs"] = dict(element.attrib)
# Group children by tag to detect repeats
children_by_tag = {}
for child in element:
child_struct = _element_to_struct(child)
tag = child.tag
children_by_tag.setdefault(tag, []).append(child_struct)
# Attach grouped children (singletons vs lists)
for tag, items in children_by_tag.items():
node[tag] = items if len(items) > 1 else items[0]
# Attach text content
text = (element.text or "").strip()
if text:
if node: # already has attrs or children
node["#text"] = text
else:
return text # pure text node becomes scalar
return node
def _decode_special_struct(struct):
"""
Decode special types when marked via @attrs['type'] on a dict payload.
Returns either a decoded Python object or the original struct.
"""
if not isinstance(struct, dict):
return struct
attrs = struct.get("@attrs")
if not isinstance(attrs, dict):
return struct
encoded_type = attrs.get("type")
if not encoded_type:
return struct
if encoded_type == "ndarray":
dtype = attrs.get("dtype") or "float64"
shape_txt = attrs.get("shape") or ""
shape = tuple(int(s) for s in shape_txt.split(",")) if shape_txt else None
items = struct.get("item", [])
arr = np.array(items, dtype=dtype)
if shape:
try:
arr = arr.reshape(shape)
except Exception:
# If reshape fails, return flat array
pass
return arr
if encoded_type == "datetime":
iso = struct.get("#text", "")
try:
return datetime.fromisoformat(iso)
except Exception:
return iso # leave as string if parse fails
if encoded_type == "astropy_time":
isot = struct.get("#text", "")
scale = attrs.get("scale", "utc")
if Time is not None:
try:
return Time(isot, scale=scale)
except Exception:
return isot
return isot # astropy not available: return text
if encoded_type == "set":
return set(struct.get("item", []))
if encoded_type == "tuple":
return tuple(struct.get("item", []))
return struct
def _struct_to_python(obj, decode_special):
"""Recursively convert the XML-structure to plain Python, applying special decoding."""
if isinstance(obj, list):
return [_struct_to_python(v, decode_special) for v in obj]
if isinstance(obj, dict):
# Recurse into children first
recursed = {k: _struct_to_python(v, decode_special) for k, v in obj.items()}
# Then decode marked special payloads, if requested
return _decode_special_struct(recursed) if decode_special else recursed
return obj # scalars pass through unchanged
[docs]
def read_xml(filename, keep_root=False, decode_special=True):
"""
Parse an XML file into Python data.
Args:
filename: Path-like or string to an XML file.
keep_root (bool): If False (default), return the content of the root element.
If True, return {root_tag: content}.
decode_special (bool): If True (default), restore marked ndarray/datetime/Time/set/tuple.
Returns:
dict | list | str | int | float | bool | numpy.ndarray | datetime | astropy.time.Time
"""
tree = ET.parse(str(filename))
root = tree.getroot()
struct = _element_to_struct(root)
data = _struct_to_python(struct, decode_special=decode_special)
return {root.tag: data} if keep_root else data
[docs]
def load_xml(filename, keep_root=False, decode_special=True):
"""
Alias of read_xml(), provided with the requested name.
"""
return read_xml(filename, keep_root=keep_root, decode_special=decode_special)
[docs]
def save_xml(filename, data, root_tag="root", pretty=True, xml_declaration=True, encoding="utf-8"):
"""
Serialize Python data to XML and write to 'filename'.
- Dict/List/Scalar supported.
- Special encodings:
* numpy.ndarray -> @attrs: type="ndarray", dtype, shape + <item> values
* numpy scalars -> converted to native Python numbers
* datetime -> @attrs: type="datetime", #text=ISO8601
* astropy Time -> @attrs: type="astropy_time", scale + #text=ISOT
* set/tuple -> @attrs: type="set"/"tuple" + <item> children
"""
def _indent_in_place(element, level=0):
indent_space = " "
i = "\n" + level * indent_space
if len(element):
if not element.text or not element.text.strip():
element.text = i + indent_space
for child in element:
_indent_in_place(child, level + 1)
if not child.tail or not child.tail.strip():
child.tail = i
if level and (not element.tail or not element.tail.strip()):
element.tail = i
def _serialize_special(obj):
if isinstance(obj, np.ndarray):
flat = obj.ravel().tolist()
return {
"@attrs": {"type": "ndarray", "dtype": str(obj.dtype), "shape": ",".join(str(x) for x in obj.shape)},
"item": [x.item() if isinstance(x, np.generic) else x for x in flat],
}
if isinstance(obj, (np.integer, np.floating, np.bool_)):
return obj.item()
if isinstance(obj, datetime):
return {"@attrs": {"type": "datetime"}, "#text": obj.isoformat()}
if Time is not None and isinstance(obj, Time):
return {"@attrs": {"type": "astropy_time", "scale": obj.scale}, "#text": obj.isot}
if isinstance(obj, set):
return {"@attrs": {"type": "set"}, "item": list(obj)}
if isinstance(obj, tuple):
return {"@attrs": {"type": "tuple"}, "item": list(obj)}
return None
def _python_to_struct(obj):
special = _serialize_special(obj)
if special is not None:
if isinstance(special, dict):
out = {}
for k, v in special.items():
if k in ("@attrs", "#text"):
out[k] = v
else:
out[k] = _python_to_struct(v)
return out
return special
if isinstance(obj, dict):
return {k: _python_to_struct(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_python_to_struct(v) for v in obj]
if isinstance(obj, (str, bool, int, float)):
return obj
if isinstance(obj, bytes):
return obj.decode("utf-8", "replace")
return str(obj)
def _struct_to_element(tag, payload):
elem = ET.Element(tag)
if not isinstance(payload, (dict, list)):
elem.text = str(payload)
return elem
if isinstance(payload, list):
for item in payload:
elem.append(_struct_to_element("item", item))
return elem
attrs = payload.get("@attrs")
if isinstance(attrs, dict):
for k, v in attrs.items():
elem.set(k, str(v))
text = payload.get("#text")
if isinstance(text, (str, int, float, bool)):
elem.text = str(text)
for key, value in payload.items():
if key in ("@attrs", "#text"):
continue
if isinstance(value, list):
for item in value:
elem.append(_struct_to_element(key, item))
else:
elem.append(_struct_to_element(key, value))
return elem
structure = _python_to_struct(data)
if isinstance(structure, dict) and len(structure) == 1 and next(iter(structure)) not in ("@attrs", "#text"):
only_key = next(iter(structure))
root_element = _struct_to_element(only_key, structure[only_key])
else:
root_element = _struct_to_element(root_tag, structure)
if pretty:
_indent_in_place(root_element)
ET.ElementTree(root_element).write(filename, encoding=encoding, xml_declaration=xml_declaration)