Source code for hpmcm.table

from __future__ import annotations

from typing import Any

import numpy as np
import pandas
import pyarrow.parquet as pq


[docs] class TableColumnInfo: """Helper class to manage a column in a table This provides a mechanism to document the column in the class docstring, and to validate input data """ def __init__(self, dtype: type, msg: str): self.dtype = dtype self.msg = msg def __repr__(self) -> str: # return f"{self.dtype:10}\n {self.msg}" return f"{self.dtype.__name__:8} | {self.msg:50}"
[docs] def validate(self, val: np.ndarray) -> None: """Validate data used to fill a column is of the correct type""" assert isinstance(val, np.ndarray) assert val.dtype == self.dtype
[docs] class TableInterface: """Table Schema""" _schema: dict[str, TableColumnInfo] = dict() def __init__(self, df: pandas.DataFrame | None = None, **kwargs: Any): if df is None: self._data = self.toPandas(**kwargs) else: self._data = df @classmethod def _describeSchema(cls) -> str: """Describe the columns in this table""" s = [] for name, val in cls._schema.items(): assert isinstance(val, TableColumnInfo) s.append(f"| {name:15} | {val} |") return "\n+-----------------+----------+----------------------------------------------------+\n".join( s ) def __init_subclass__(cls, **kwargs: Any) -> None: config_text = cls._describeSchema() if cls.__doc__ is None: # pragma: no cover cls.__doc__ = f"\nNotes\n-----\n{cls.__name__} schema\n\n" cls.__doc__ += "+-----------------+----------+----------------------------------------------------+\n" cls.__doc__ += "| Column | Type | Description |\n" cls.__doc__ += "+=================+==========+====================================================+\n" cls.__doc__ += config_text cls.__doc__ += "\n+-----------------+----------+----------------------------------------------------+\n" else: # strip any existing configuration text from parent classes that is at the end of the doctring cls.__doc__ = cls.__doc__.split("Notes")[0] cls.__doc__ += f"\nNotes\n-----\n{cls.__name__} schema\n\n" cls.__doc__ += "+-----------------+----------+----------------------------------------------------+\n" cls.__doc__ += "| Column | Type | Description |\n" cls.__doc__ += "+=================+==========+====================================================+\n" cls.__doc__ += config_text cls.__doc__ += "\n+-----------------+----------+----------------------------------------------------+\n" @property def data(self) -> pandas.DataFrame: """Return the underlying data""" return self._data
[docs] @classmethod def validate(cls, **kwargs: Any) -> None: """Validate that data match the schema Parameters ---------- kwargs: The input data Raises ------ ValueError: The number of columns don't match the schema KeyError: An input column is not in the schema """ table_size: int = -1 if len(kwargs) != len(cls._schema): # pragma: no cover raise ValueError(f"{len(kwargs)} != {len(cls._schema)}") for key, val in kwargs.items(): if key not in cls._schema: # pragma: no cover raise KeyError(f"{key} not in {list(cls._schema.keys())}") col_info = cls._schema[key] col_info.validate(val) if table_size < 0: table_size = val.size else: assert val.size == table_size
[docs] @classmethod def read(cls, file_path: str, extra_cols: list[str]) -> pandas.DataFrame: """Read a dataframe from a file""" read_list = list(cls._schema.keys()) read_list += extra_cols parq = pq.read_pandas(file_path, columns=read_list) df = parq.to_pandas() return df
[docs] @classmethod def toPandas(cls, **kwargs: Any) -> pandas.DataFrame: """Convert data to a pandas DataFrame Parameters ---------- kwargs: The input data """ cls.validate(**kwargs) return pandas.DataFrame(kwargs)
[docs] @classmethod def emtpyNumpyDict(cls, n: int) -> dict[str, np.ndarray]: """Create a dict of empty numpy arrays Parameter --------- n: Length of the arrays """ return {key: np.zeros((n), dtype=val.dtype) for key, val in cls._schema.items()}