# Source code for datopy.modeling

"""
Tools for data modeling, validation, and raw data processing.

.. note:: WIP.

Overview
~~~~~~~~

.. currentmodule:: datopy.modeling

.. rubric:: Auto-generated data models

Tools for automated generation of data models from data.

.. autosummary::
    :nosignatures:

    list_to_dict
    compare_dict_keys
    apply_recursive
    schema_jsonify

.. rubric:: A flexible framework for ETL workflows

.. autosummary::
    :nosignatures:

    BaseProcessor

API
~~~
"""


# Section header references:
# https://docs.scipy.org/doc/scipy/reference/fft.html
# https://github.com/scipy/scipy/blob/main/scipy/fft/__init__.py
#
# https://matplotlib.org/3.9.0/api/axes_api.html
# https://github.com/matplotlib/matplotlib/blob/main/doc/api/axes_api.rst
#
# https://matplotlib.org/3.9.0/api/dates_api.html#matplotlib.dates
# https://github.com/matplotlib/matplotlib/blob/v3.9.0/lib/matplotlib/dates.py


import sys
import json
import pprint
import doctest
import pandas as pd
from jsonschema import validate
import typing
from pydantic import (
    Field,
    BaseModel,
    PositiveInt,
    ValidationError,
    field_validator,
)
from typing import (
    Any,
    List,
    Callable,
    Iterable,
    Collection,
    NamedTuple,
    TYPE_CHECKING
)
from typing_extensions import Annotated, TypeAliasType

# import datopy._settings
from datopy.workflow import doctest_function
from datopy.util._numpydoc_validate import numpydoc_validate_module

# Custom types
# NestedDict: (recursively) nested dict with arbitrary depth and pre-defined
# node type -- keys are str; each value is another NestedDict, a list of str,
# or None.
# TODO: check this!
# NOTE(review): the quoted "NestedDict" is a forward reference combined with
# ``|`` at import time -- confirm the targeted type checkers resolve the
# recursion as intended.
NestedDict = dict[str, "NestedDict" | List[str] | None]
# GenericNestedDict: loosest dictionary alias (any key, any value); used in the
# signatures of compare_dict_keys and schema_jsonify below.
GenericNestedDict = dict[object, object]


# -- Data dictionary generation utils ----------------------------------------


[docs] def list_to_dict( obj: list[object] | tuple[object] | set[object], max_items: int | None = None ) -> dict[int, object]: """ Provide a dictionary representation of a list, using indices as keys. Also compatible with other non-dictionary or string-like iterables. Parameters ---------- obj : list A list to convert to a dictionary representation. max_items : int, default=None Option to impose a limit on the number of elements to iterate over. Intended use: constructing pattern-based data models from a sample. Returns ------- dict The supplied list's dictionary representation. Examples -------- >>> from datopy.modeling import list_to_dict >>> my_list = [1, 'two', [3], {'four': 5}] >>> list_to_dict(my_list) {1: 1, 2: 'two', 3: [3], 4: {'four': 5}} >>> my_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] >>> list_to_dict(my_list, max_items=5) {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} >>> my_dict = dict(a=1, b='two') >>> list_to_dict(my_dict) Not running conversion since obj is already a dictionary. {'a': 1, 'b': 'two'} """ if isinstance(obj, dict): print("Not running conversion since obj", # type: ignore [unreachable] "is already a dictionary.") return obj else: return {(key + 1): value for key, value in enumerate(obj) if (max_items is None) or (key < max_items)}
def compare_dict_keys(
    dict1: GenericNestedDict | object,
    dict2: GenericNestedDict | object
) -> GenericNestedDict | str | None:
    """
    Compare two dictionaries recursively and identify missing keys.

    Only keys are compared; values matter solely in that nested dictionaries
    are descended into. Keys present in ``dict2`` but absent from ``dict1``
    are not reported.

    Parameters
    ----------
    dict1 : dict
        The reference dictionary.
    dict2 : dict
        The comparison dictionary to be checked against ``dict1``.

    Returns
    -------
    dict | str | None
        ``None`` when ``dict2`` has every key of ``dict1`` at every nesting
        level (or when ``dict1`` is not a dictionary). The string
        ``'missing nested dictionary'`` when ``dict1`` is a dictionary but
        ``dict2`` is not. Otherwise a dictionary with up to two entries:
        ``'missing_keys'``, the list of keys of ``dict1`` absent from
        ``dict2`` (in ``dict1`` order), and ``'nested_diff'``, which maps each
        shared key to the result of comparing the two nested values.

    Examples
    --------
    Setup

    >>> from datopy.modeling import compare_dict_keys
    >>> import copy
    >>> dict1 = {'a1': 1, 'a2': 'two', 'a3': [3],
    ...          'b1': {'b11': 1, 'b12': 'two', 'b13': [3]},
    ...          'c1': {'c11': {'c111': 1, 'c112': 'two', 'c113': [3]}}
    ...          }

    Identical dictionaries

    >>> dict2 = copy.deepcopy(dict1)
    >>> compare_dict_keys(dict1, dict2)

    Missing nesting level 0 key

    >>> del dict2['a1']
    >>> compare_dict_keys(dict1, dict2)
    {'missing_keys': ['a1']}

    Missing nesting level 1 key

    >>> dict2 = copy.deepcopy(dict1)
    >>> del dict2['b1']['b12']
    >>> compare_dict_keys(dict1, dict2)
    {'nested_diff': {'b1': {'missing_keys': ['b12']}}}

    Missing nesting level 2 key

    >>> dict2 = copy.deepcopy(dict1)
    >>> del dict2['c1']['c11']['c113']
    >>> compare_dict_keys(dict1, dict2)
    {'nested_diff': {'c1': {'nested_diff': {'c11': {'missing_keys': ['c113']}}}}}
    """
    # Nothing to compare against a non-dictionary reference.
    if not isinstance(dict1, dict):
        return None
    # The reference has a nested dictionary where the comparison does not.
    if not isinstance(dict2, dict):
        return "missing nested dictionary"

    # Walk ``dict1`` in insertion order (rather than through sets) so that
    # the reported keys come out in a deterministic order.
    missing_keys: list[object] = []
    diff_dict: dict[object, object] = {}
    for key, reference_value in dict1.items():
        if key not in dict2:
            missing_keys.append(key)
            continue
        nested_diff = compare_dict_keys(reference_value, dict2[key])
        # Record only the shared keys whose nested values differ
        if nested_diff is not None:
            diff_dict[key] = nested_diff

    # Return None if no missing keys or nested differences were found
    if not (missing_keys or diff_dict):
        return None

    result: dict[object, object] = {}
    if missing_keys:
        result['missing_keys'] = missing_keys
    if diff_dict:
        result['nested_diff'] = diff_dict
    return result
[docs] def apply_recursive( func: Callable[..., Any], obj ) -> dict[str | int, Any] | Any: """ Apply ``func`` to each terminal value in a nested data structure. Valid nested data structures include those with explicit or implied key/value pairs. Parameters ---------- func : Callable[..., Any] _description_. obj : _description_. Returns ------- dict: A tree-like dictionary representation of the transformed ``obj``. Examples -------- >>> from datopy.modeling import apply_recursive >>> import pprint Define the data >>> nested_data = { ... 'type': 'album', 'url': 'link.com', 'audio_features': [ ... {'loudness': -11.4, 'duration_ms': 251}, ... {'loudness': -15.5, 'duration_ms': 284} ... ] ... } >>> pprint.pp(nested_data) {'type': 'album', 'url': 'link.com', 'audio_features': [{'loudness': -11.4, 'duration_ms': 251}, {'loudness': -15.5, 'duration_ms': 284}]} Convert to json-friendly representation >>> serialized = apply_recursive(str, nested_data) >>> pprint.pp(serialized) {'type': 'album', 'url': 'link.com', 'audio_features': {1: {'loudness': '-11.4', 'duration_ms': '251'}, 2: {'loudness': '-15.5', 'duration_ms': '284'}}} Convert to field/type pairs >>> schema = apply_recursive(lambda x: type(x).__name__, nested_data) >>> pprint.pp(schema) {'type': 'str', 'url': 'str', 'audio_features': {1: {'loudness': 'float', 'duration_ms': 'int'}, 2: {'loudness': 'float', 'duration_ms': 'int'}}} """ # Handle dictionary-like objects if hasattr(obj, 'items'): return {key: apply_recursive(func, value) for key, value in obj.items()} # Handle list-like objects elif isinstance(obj, (list, tuple, set)): return {key: apply_recursive(func, value) for key, value in list_to_dict(obj, max_items=5).items()} # Handle base cases elif isinstance(obj, str): return func(obj) else: return func(obj)
def schema_jsonify(obj: GenericNestedDict) -> GenericNestedDict:
    r"""
    Convert a nested field/type-name mapping into a JSON-Schema-style dict.

    The input is a tree of dictionaries whose terminal values are Python
    type names such as ``'str'`` or ``'int'`` (e.g. the output of
    ``apply_recursive(lambda x: type(x).__name__, data)``).

    Parameters
    ----------
    obj : dict
        The structure to convert. A non-empty dictionary whose first key is
        an integer is treated as an array (an indexed sequence); any other
        non-empty dictionary is treated as an object. Terminal values are
        type-name strings.

    Returns
    -------
    dict
        The schema node for ``obj``:

        * array-like: ``type``, ``minItems`` (1), ``maxItems`` (the number
          of entries), ``uniqueItems`` and ``items`` -- the schema of the
          first entry, assuming homogeneous entries;
        * object: ``type``, ``properties`` (one schema per key) and
          ``required`` (every key);
        * terminal: ``{'type': ...}`` with ``'str'`` mapped to ``'string'``,
          ``'int'``/``'float'`` to ``'number'``, ``'bool'`` to ``'boolean'``
          and anything else (including an empty dictionary) to ``'null'``.

    Examples
    --------
    >>> import pprint
    >>> from datopy.modeling import schema_jsonify
    >>> original_schema = {
    ...     'name': 'str', 'quantity': 'int',
    ...     'features': {
    ...         1: {'volume': 'str', 'duration': 'float'},
    ...         2: {'volume': 'str', 'duration': 'float'}
    ...     },
    ...     'creator': {'person': {'name': 'str'},
    ...                 'company': {'name': 'str', 'location': 'str'}}
    ... }
    >>> schema = schema_jsonify(original_schema)
    >>> schema = {**{"title": "title", "description": "description"}, **schema}
    >>> pprint.pp(schema, compact=True, depth=3)
    {'title': 'title',
     'description': 'description',
     'type': 'object',
     'properties': {'name': {'type': 'string'},
                    'quantity': {'type': 'number'},
                    'features': {'type': 'array',
                                 'minItems': 1,
                                 'maxItems': 2,
                                 'uniqueItems': True,
                                 'items': {...}},
                    'creator': {'type': 'object',
                                'properties': {...},
                                'required': [...]}},
     'required': ['name', 'quantity', 'features', 'creator']}
    """
    # JSON Schema type for each recognized Python type name
    scalar_types = {
        "str": "string",
        "int": "number",
        "float": "number",
        "bool": "boolean",
    }

    if isinstance(obj, dict) and obj:
        first_key = next(iter(obj))

        # Case 1 (array-like): integer keys stand for sequence positions
        if isinstance(first_key, int):
            return {
                "type": 'array',  # coerced to object; includes tuple/list
                "minItems": 1,
                "maxItems": len(obj),
                "uniqueItems": True,
                # Recurse on the first item (whatever its index), assuming
                # homogeneity for simplicity
                "items": schema_jsonify(
                    obj[first_key]),  # type: ignore [arg-type]
            }

        # Case 2 (dictionary)
        return {
            "type": "object",
            "properties": {
                key: schema_jsonify(val)  # type: ignore [arg-type]
                for key, val in obj.items()
            },
            # Require all by default to easily edit later
            "required": list(obj),
        }

    # Base cases (non-container types); the ``str`` guard keeps unhashable
    # values away from the lookup table
    if isinstance(obj, str):
        return {"type": scalar_types.get(obj, "null")}
    return {"type": "null"}
# -- Data processing base types and class ------------------------------------
# TODO: replace these with field validators -- either general or model-specific
class CustomTypes:
    """
    Define reusable custom field types.

    Each attribute is an ``Annotated[str, Field(...)]`` alias whose
    ``pattern`` restricts the characters a value may contain and whose
    ``description`` cross-references the attribute in the documentation.

    Notes
    -----
    Whitespace around commas should be stripped before analysis.

    For additional info on Pydantic custom types, see:
    https://docs.pydantic.dev/latest/concepts/types/.
    """

    # Accepts one or more lowercase letters, commas and spaces.
    CSVstr = Annotated[str, Field(
        pattern=r'^[a-z, ]+$',
        description=":attr:`~datopy.modeling.CustomTypes` : ``CSVstr``")
    ]
    """Lowercase comma-separated string.

    Excludes numerics and special characters.
    """

    # Accepts lowercase letters, digits, commas, periods, exclamation marks
    # and spaces.
    # NOTE(review): the pattern admits '.' and '!' although the description
    # below says special characters are excluded, and it is identical to the
    # CSVnumsent pattern -- confirm which of the two is intended.
    CSVnumstr = Annotated[str, Field(
        pattern=r'^[a-z0-9,.! ]+$',
        description=":attr:`~datopy.modeling.CustomTypes` : ``CSVnumstr``"),
    ]
    """Lowercase comma-separated string.

    Allows numerics; excludes special characters.
    """

    # Accepts lowercase letters, digits, commas, periods, exclamation marks
    # and spaces; presumably meant for sentence-like text (name suggests
    # "numeric sentence") -- TODO confirm and document.
    CSVnumsent = Annotated[str, Field(
        pattern=r'^[a-z0-9,.! ]+$',
        description=":attr:`~datopy.modeling.CustomTypes` : ``CSVnumsent``"),
    ]
# TODO: implement BaseProcessor
class BaseProcessor:
    """
    The fundamental data processing structure.

    A skeleton for retrieve/process/validate/load workflows: ``retrieve``
    and ``process`` are placeholders that raise ``NotImplementedError``,
    while ``to_df`` loads ``self.data`` into a data frame.

    Parameters
    ----------
    model : BaseModel
        The data model, stored as ``self.model``.
    query : NamedTuple
        The query, stored as ``self.query``.
    """

    def __init__(self, model: BaseModel, query: NamedTuple):
        self.query = query
        self.model = model

    def retrieve(self):
        """
        Extract data for the query from the API of the supplied model.

        Raises
        ------
        NotImplementedError
            Always; the retrieval routine is not implemented here.
        """
        ### Retrieval routine goes here ###
        raise NotImplementedError
        # include return here? self assignment?

    def process(self):
        """
        Prepare (extract/clean) the retrieved data.

        Raises
        ------
        NotImplementedError
            Always; the processing routine is not implemented here.
        """
        ### Processing routine goes here ###
        # TODO: raise NotRetrieved error (try model.obj)
        raise NotImplementedError

    def _validate(self):
        """
        Validate the processed data against the supplied model.

        Placeholder: no validation is performed yet; the method only prints
        ``Validated``.
        """
        # TODO: instantiate the model with the processed data
        # (``self.model(**self.data)``) and report the details of any
        # pydantic ``ValidationError`` (``pprint.pp(e.errors())``).
        print("Validated")
        return None

    def to_df(self) -> pd.DataFrame:
        """
        Load the data into a dataframe for further processing or analysis.

        Returns
        -------
        pd.DataFrame
            The processed entry as a data frame.

        Raises
        ------
        AttributeError
            If no ``data`` attribute has been set on the instance.
        """
        # ``data`` is not assigned anywhere in this class (presumably a
        # subclass's ``process`` sets it), so fail with an actionable message
        # before reporting anything as validated.
        try:
            data = self.data  # type: ignore [attr-defined]
        except AttributeError as err:
            raise AttributeError(
                f"{type(self).__name__} has no processed data; run process() "
                "(which must set self.data) before calling to_df()."
            ) from err

        # Validate before loading
        self._validate()

        return pd.DataFrame([data])
if __name__ == "__main__":
    # Script entry point: runs docstring validation on this module.
    #
    # Both doctest entry points below are currently disabled. Uncomment the
    # first to run all of the module's doctests, or the second to run
    # specific tests (here: BaseProcessor) through datopy's doctest_function
    # helper.
    # Comment out line 2 to run all tests; line 1 to run specific tests.
    # doctest.testmod(verbose=True)
    # doctest_function(object=BaseProcessor, globs=globals())

    # Docstring validation (sys.modules['__main__'] is this module when it is
    # executed as a script)
    numpydoc_validate_module(sys.modules['__main__'])

    # Type checks that the compiler does not see or understand
    obj_to_check = {1: "one", 2: "two"}
    if TYPE_CHECKING:
        # Only static type checkers evaluate this branch; uncomment the line
        # below to have them report the inferred type.
        # reveal_type(obj_to_check)
        pass