Source code for datopy.models.media

"""
Data models, validators, and ETL tools for scraped media data.

Includes support for film reviews (via IMDb), music albums (via Spotify),
and related information (via Wikipedia).

.. note:: WIP.

Overview
~~~~~~~~

.. currentmodule:: datopy.models.media

.. rubric:: Data models

.. autosummary::
    :nosignatures:

    IMDbFilm
    SpotifyAlbum

API
~~~
"""

import re
import sys
import typing
import doctest
import pandas as pd
from typing import (
    List,
    Tuple,
    TypeVar,
    Literal,
    NamedTuple,
)
from pydantic import (
    Field,
    BaseModel,
    TypeAdapter,
    ValidationInfo,
    ValidationError,
    field_validator,
)
from typing_extensions import Annotated

import imdb
import spotipy
import wptools
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

# import datopy._settings
from datopy.etl import omit_string_patterns
from datopy.workflow import doctest_function
from datopy._examples import find_project_root
from datopy.modeling import BaseProcessor, CustomTypes
from datopy.util._numpydoc_validate import numpydoc_validate_module


# -- Metadata retrieval ------------------------------------------------------


# Custom type containing search terms with required 'title' attribute
# TODO: archive this: does not generalize well
# class MediaSearchTerms(NamedTuple):
#     title: str
#     creator: str | None = None

# Film = NamedTuple('Film', [('title', str), ('artist', None)])
# Album = NamedTuple('Album', [('title', str), ('artist', str)])
# Book = NamedTuple('Book', [('title', str), ('artist', str | None)])


[docs] class MediaQuery(NamedTuple): """ Query object types for media metadata retrieval. .. todo:: test """ title: str artist: str | None = None
Film = type('Film', (MediaQuery,), {}) Album = type('Album', (MediaQuery,), {}) Book = type('Book', (MediaQuery,), {}) # class MediaQuery: # """Query object types for media metadata retrieval.""" # Film = Film # Album = Album # Book = Book
[docs] class IMDbFilm(BaseModel): r""" Data model for processed imdb metadata. Examples -------- >>> from pydantic import ValidationError >>> from datopy.models.media import IMDbFilm >>> from datopy._examples import imdb_film_retrieve Valid film >>> valid_film = IMDbFilm( ... title='name 10!', imdb_id='tt1234567', kind='movie', ... year=1990, rating=7.2, votes=122, ... genres='romantic comedy, thriller', cast='mrs smith,mr smith', ... plot='alas! once upon a time, ...', ... budget_mil=1123929) Invalid film >>> invalid_film = dict( ... title='name', imdb_id='tt12', year=1975, votes=-2, rating=5.0) >>> try: ... IMDbFilm(**invalid_film) ... except ValidationError as e: ... print(e) # use pprint.pp(e.errors()) for easy-to-read list 3 validation errors for IMDbFilm imdb_id String should match pattern '^tt.*\d{7}$' [type=string_pattern_mismatch, input_value='tt12', input_type=str] For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch kind Field required [type=missing, input_value={'title': 'name', 'imdb_i...tes': -2, 'rating': 5.0}, input_type=dict] For further information visit https://errors.pydantic.dev/2.8/v/missing votes Input should be greater than or equal to 0 [type=greater_than_equal, input_value=-2, input_type=int] For further information visit https://errors.pydantic.dev/2.8/v/greater_than_equal Survey available fields and types >>> import pprint >>> from datopy.models.media import Film >>> from datopy._examples import imdb_film_retrieve >>> from datopy.modeling import apply_recursive >>> film = imdb_film_retrieve(Film('spirited away')) .. # >>> film.keys() # >>> pprint.pp(apply_recursive(lambda x: type(x).__name__, film), depth=3) """ # Identifiers title: CustomTypes.CSVnumstr imdb_id: str = Field( pattern=r'^tt.*\d{7}$', description="Unique 7-digit IMDb tt identifier" ) kind: CustomTypes.CSVnumstr = Field( examples=['movie', 'tv series'], description="Retrieved from: `type`" ) # Numeric year: int = Field(ge=1880, le=3000) rating: float = Field(ge=0, le=10) votes: int = Field(ge=0) runtime_mins: float | None = Field(gt=0, default=None) # String lists genres: CustomTypes.CSVstr | None = Field(default=None) countries: CustomTypes.CSVstr | None = Field(default=None) director: CustomTypes.CSVstr | None = Field(default=None) writer: CustomTypes.CSVstr | None = Field(default=None) composer: CustomTypes.CSVstr | None = Field(default=None) cast: CustomTypes.CSVstr | None = Field(default=None) # Strings plot: CustomTypes.CSVnumsent | None = Field(default=None) synopsis: CustomTypes.CSVnumsent | None = Field(default=None) plot_outline: CustomTypes.CSVnumsent | None = Field(default=None) # Financial budget_mil: float | None = Field( ge=0, default=None, description="Strip $/, & text after first space" ) opening_weekend_gross_mil: float | None = Field(ge=0, default=None) cumulative_worldwide_gross_mil: float | None = Field(ge=0, default=None)
[docs] @field_validator('imdb_id', 'kind') @classmethod def check_alphanumeric(cls, v: str, info: ValidationInfo) -> str: if isinstance(v, str): # info.field_name is the name of the field being validated is_alphanumeric = v.replace(' ', '').isalnum() assert is_alphanumeric, f'{info.field_name} must be alphanumeric' # return v.title() return v
# TODO: place media/animals/nations models/queries/processors in # {media/eco/global}_pulse.py # TODO: implement 4 pydantic processed data models + 1 valid/invalid demo
[docs] class SpotifyAlbum(BaseModel): """ Data model for processed Spotify metadata. Raw data schema reference: 'datopy/output/spotify_album_schema.json'. """ # fields of interest: title: str album_type: str
[docs] class WikiBook(BaseModel): """ Data model for processed Wikipedia novel metadata. Raw data schema reference: 'output/wiki_book_schema.json'. """ # fields of interest: title: str
[docs] class WikiFilm(BaseModel): """ Data model for processed Wikipedia film metadata. Raw data schema reference: 'datopy/output/wiki_film_schema.json'. """ # fields of interest: title: str
[docs] class WikiAlbum(BaseModel): """ Data model for processed Wikipedia album metadata. Raw data schema reference: 'datopy/output/wiki_album_schema.json'. """ # fields of interest: title: str
# XXX Scratch tests # valid_obj = {} # invalid_obj = {} # pd.DataFrame(pd.json_normalize(dict(valid_obj))) # pd.DataFrame(pd.json_normalize(dict(invalid_obj))) # try: IMDbFilm(**valid_obj) # except ValidationError as e: pprint.pp(e.errors()) # try: IMDbFilm(**invalid_obj) # except ValidationError as e: pprint.pp(e.errors()) # -- Metadata retrieval ------------------------------------------------------ # Subclass Processor # TODO: move these to media # TODO: implement 5 subclasses
[docs] class IMDbFilmProcessor(BaseProcessor): """ _summary_. """
[docs] def retrieve(self): # title = self.query.title # Retrieval routine obj = [] self.obj = obj return self
[docs] def process(self): # Processing routine data = [] self.data = data return self
# XXX Scratch tests # IMDbFilm = []; Film = [] # film = IMDbFilmProcessor(model=IMDbFilm, query=Film) # film.retrieve().process().to_df() if __name__ == "__main__": # Comment out (2) to run all tests in script; (1) to run specific tests # doctest.testmod(verbose=True) # doctest_function(IMDbFilm, globs=globals(), verbose=False) skip = ["Album", "Film", "Book", "MediaQuery"] numpydoc_validate_module(sys.modules['__main__'], excluded_objects=skip) # One-off tests pass