from __future__ import annotations
from typing import Any
import numpy as np
import pandas
import pyarrow.parquet as pq
[docs]
class TableColumnInfo:
    """Helper class to manage a column in a table

    This provides a mechanism to document the column in the class
    docstring, and to validate input data.

    Parameters
    ----------
    dtype:
        Expected element type of the column.  It is compared against the
        ``dtype`` of the arrays passed to `validate`, and its ``__name__``
        is shown when the column is printed
    msg:
        Short description of the column
    """

    def __init__(self, dtype: type, msg: str):
        self.dtype = dtype
        self.msg = msg

    def __repr__(self) -> str:
        # Fixed widths (8 and 50) so that the text can be used as the
        # "Type | Description" cells of a fixed-width table row.
        return f"{self.dtype.__name__:8} | {self.msg:50}"

    def validate(self, val: np.ndarray) -> None:
        """Validate data used to fill a column is of the correct type

        Parameters
        ----------
        val:
            The data for this column

        Raises
        ------
        AssertionError:
            ``val`` is not a numpy array, or its dtype does not match
            the dtype of this column
        """
        # Raised explicitly instead of using ``assert`` so that the checks
        # are not stripped under ``python -O``.  The exception type is kept
        # as AssertionError so existing callers see no difference.
        if not isinstance(val, np.ndarray):
            raise AssertionError(
                f"Expected a numpy.ndarray, got {type(val).__name__}"
            )
        if val.dtype != self.dtype:
            expected = getattr(self.dtype, "__name__", self.dtype)
            raise AssertionError(f"Expected dtype {expected}, got {val.dtype}")
[docs]
class TableInterface:
    """Table Schema

    Subclasses set ``_schema`` to a mapping from column name to
    `TableColumnInfo`.  When a subclass is created, a table describing
    its schema is appended to the subclass docstring.

    Parameters
    ----------
    df:
        Existing data to wrap.  It is stored as-is, without validation
    kwargs:
        Used only if ``df`` is None: the column data, keyed by column
        name, which are validated and converted by `toPandas`
    """

    _schema: dict[str, TableColumnInfo] = {}

    # Border lines of the schema table appended to subclass docstrings.
    # The cell widths (15, 8, 50) match the formats used for the rows.
    _tableRule = "+-----------------+----------+----------------------------------------------------+"
    _tableHeaderRule = "+=================+==========+====================================================+"

    def __init__(self, df: pandas.DataFrame | None = None, **kwargs: Any):
        if df is None:
            self._data = self.toPandas(**kwargs)
        else:
            self._data = df

    @classmethod
    def _describeSchema(cls) -> str:
        """Describe the columns in this table

        Returns
        -------
        str:
            One table row per column, joined by border lines.  The
            leading and trailing borders are not included
        """
        rows = []
        for name, val in cls._schema.items():
            # Internal sanity check on the schema declared by the subclass
            assert isinstance(val, TableColumnInfo)
            rows.append(f"| {name:15} | {val} |")
        return f"\n{cls._tableRule}\n".join(rows)

    @classmethod
    def _schemaNotes(cls) -> str:
        """Build the 'Notes' docstring section holding the schema table"""
        # Pad the header cells to the same widths as the data rows so that
        # every line of the grid table has the same length.
        header = f"| {'Column':15} | {'Type':8} | {'Description':50} |"
        return (
            f"\nNotes\n-----\n{cls.__name__} schema\n\n"
            f"{cls._tableRule}\n"
            f"{header}\n"
            f"{cls._tableHeaderRule}\n"
            f"{cls._describeSchema()}"
            f"\n{cls._tableRule}\n"
        )

    @staticmethod
    def _stripNotes(doc: str) -> str:
        """Return `doc` without its 'Notes' section and anything after it

        Only a line reading ``Notes`` that is followed by a line of dashes
        counts as the section header, so the word "Notes" inside ordinary
        text does not truncate the docstring.
        """
        lines = doc.splitlines(keepends=True)
        for idx, (title, underline) in enumerate(zip(lines, lines[1:])):
            dashes = underline.strip()
            if title.strip() == "Notes" and dashes and not dashes.strip("-"):
                return "".join(lines[:idx])
        return doc

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Append the schema table to the docstring of each new subclass"""
        # Forward to the next class in the MRO so that other base classes
        # defining __init_subclass__ keep working.
        super().__init_subclass__(**kwargs)
        # Drop any schema table already present (e.g. a docstring copied
        # from another table class) before appending the one for this class.
        intro = "" if cls.__doc__ is None else cls._stripNotes(cls.__doc__)
        cls.__doc__ = intro + cls._schemaNotes()

    @property
    def data(self) -> pandas.DataFrame:
        """Return the underlying data"""
        return self._data

    @classmethod
    def validate(cls, **kwargs: Any) -> None:
        """Validate that data match the schema

        Parameters
        ----------
        kwargs:
            The input data

        Raises
        ------
        ValueError:
            The number of columns don't match the schema
        KeyError:
            An input column is not in the schema
        AssertionError:
            A column fails the check of its `TableColumnInfo`, or the
            columns do not all have the same size
        """
        if len(kwargs) != len(cls._schema):  # pragma: no cover
            raise ValueError(f"{len(kwargs)} != {len(cls._schema)}")
        table_size: int | None = None
        for key, val in kwargs.items():
            if key not in cls._schema:  # pragma: no cover
                raise KeyError(f"{key} not in {list(cls._schema.keys())}")
            cls._schema[key].validate(val)
            if table_size is None:
                # The first column sets the size all others must match
                table_size = val.size
            elif val.size != table_size:
                # Explicit raise (not ``assert``) so the check survives
                # ``python -O``; the exception type is unchanged.
                raise AssertionError(
                    f"Column {key} has {val.size} entries, expected {table_size}"
                )

    @classmethod
    def read(cls, file_path: str, extra_cols: list[str] | None = None) -> pandas.DataFrame:
        """Read a dataframe from a file

        Parameters
        ----------
        file_path:
            Path of the file, which is read with `pyarrow.parquet.read_pandas`
        extra_cols:
            Columns to read in addition to the ones in the schema
        """
        read_list = [*cls._schema, *(extra_cols or [])]
        return pq.read_pandas(file_path, columns=read_list).to_pandas()

    @classmethod
    def toPandas(cls, **kwargs: Any) -> pandas.DataFrame:
        """Convert data to a pandas DataFrame

        Parameters
        ----------
        kwargs:
            The input data, which must pass `validate`
        """
        cls.validate(**kwargs)
        return pandas.DataFrame(kwargs)

    # NOTE: the method name is misspelled ("emtpy"); it is kept unchanged
    # because it is part of the public interface.
    @classmethod
    def emtpyNumpyDict(cls, n: int) -> dict[str, np.ndarray]:
        """Create a dict of zero-filled numpy arrays, one per schema column

        Parameters
        ----------
        n:
            Length of the arrays
        """
        return {key: np.zeros(n, dtype=val.dtype) for key, val in cls._schema.items()}