Source code for datopy._media_scrape

"""
Data models and retrieval/processing tools for scraping metadata for movies
and movie reviews (via IMDb), music albums (via Spotify), and related
topics (via Wikipedia).
"""

import re
import json
import pprint
import doctest
import requests
import textwrap
import pandas as pd
from typing import List
from jsonschema import validate
from pydantic import BaseModel, ValidationError

import imdb
import wptools
import spotipy
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

from datopy.inspection import display
from datopy.workflow import doctest_function


# -----------------
# --- Wikipedia ---
# -----------------
### Get Wiki
# (wiki_extract_film_metadata, wiki_extract_novel_metadata, wiki_extract_album_metadata)

# XXX wiki scratch
# page = "Canada"
# wiki_info = wptools.page(page).get_parse().data['infobox']
# pprint.pp(wiki_info)
# wiki_info['currency']


# ---------------
# --- Spotify ---
# ---------------
### Get Spotify

# ------------
# --- IMDb ---
# ------------

### Get IMDb

def get_imdb_id(movie_title: str) -> str | None:
    """
    Retrieve the unique IMDb identifier associated with a film or tv show.

    The title is submitted to IMDb's ``/find`` search page and the first
    result link whose ``href`` contains ``/title/tt`` supplies the identifier.

    Parameters
    ----------
    movie_title : str
        Title of film or tv show (sensitive to spelling but not case).

    Returns
    -------
    imdb_id : str or None
        The unique IMDb tt identifier associated with the show. When no
        link on the results page contains an identifier, the message
        ``"No IMDb Identifier found for '<movie_title>'."`` is returned
        instead. ``None`` is returned when the search request comes back
        with an HTTP error status.

    Raises
    ------
    requests.exceptions.RequestException
        Failures other than an HTTP error status (e.g. connection errors
        or the request timing out) are not caught here and propagate to
        the caller.

    Examples
    --------
    .. doctest::
        :skipif: skip_slow

        >>> from datopy._media_scrape import get_imdb_id
        >>> movie_title = "the shawshank redemption"
        >>> tt_id = get_imdb_id(movie_title)
        >>> tt_id
        'tt0111161'

        >>> movie_title = "ths shukshank redumption"
        >>> tt_id = get_imdb_id(movie_title)
        >>> tt_id
        "No IMDb Identifier found for 'ths shukshank redumption'."
    """
    base_url = "https://www.imdb.com"
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    try:
        # Pass the title via `params` so requests URL-encodes it; titles
        # containing '&', '#', '+', etc. would otherwise corrupt the query.
        # The timeout keeps an unresponsive server from hanging the caller.
        search_response = requests.get(
            f"{base_url}/find",
            params={'q': movie_title},
            headers=headers,
            timeout=10,
        )
        search_response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        return None

    soup = BeautifulSoup(search_response.content, 'html.parser')

    # Return the identifier from the first link that points at a title page.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/title/tt' in href:
            return href.split('/title/')[1].split('/')[0]

    # No link on the results page contained a tt identifier.
    return f"No IMDb Identifier found for '{movie_title}'."
def get_imdb_reviews(movie_id: str, num_reviews: int = 5) -> List[str] | None:
    r"""
    Retrieve the text of user reviews from a title's IMDb reviews page.

    Requests ``https://www.imdb.com/title/<movie_id>/reviews`` and collects
    the stripped text of the ``div`` elements carrying the class
    ``text show-more__control``.

    Parameters
    ----------
    movie_id : str
        The unique IMDb tt identifier supplied by `get_imdb_id`.
    num_reviews : int, default=5
        Maximum number of reviews to retrieve. Values below zero are
        treated as zero.

    Returns
    -------
    reviews : List[str] or None
        Review texts in page order, at most `num_reviews` of them (fewer
        if the page holds fewer matching elements). ``None`` is returned
        when the response status code is not 200.

    Raises
    ------
    requests.exceptions.RequestException
        Connection failures and timeouts are not caught here and
        propagate to the caller.

    Examples
    --------
    .. doctest::
        :skipif: skip_slow

        >>> import textwrap
        >>> from datopy._media_scrape import get_imdb_reviews, get_imdb_id
        >>> movie_title = "finding nemo"
        >>> movie_id = get_imdb_id(movie_title)
        >>> movie_reviews = get_imdb_reviews(movie_id, num_reviews=2)
        >>> for i, review in enumerate(movie_reviews, start=1):
        ...     print(f"Review {i}:\n{textwrap.fill(review[:50], 79)} ...\n");
        Review 1:
        I have enjoyed most of the computer-animated films ...
        <BLANKLINE>
        Review 2:
        I'll be totally honest and confirm to you that eve ...
        <BLANKLINE>
    """
    reviews_url = f"https://www.imdb.com/title/{movie_id}/reviews"
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    # The timeout keeps an unresponsive server from hanging the caller.
    response = requests.get(reviews_url, headers=headers, timeout=10)

    if response.status_code != 200:
        print(f"Failed to retrieve reviews. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    review_containers = soup.find_all('div', class_='text show-more__control')

    # Clamp the limit: a negative slice bound would silently drop reviews
    # from the end of the list instead of returning none.
    limit = max(num_reviews, 0)
    return [container.text.strip() for container in review_containers[:limit]]
def _film_first(values):
    """Return the first item of `values`, or None if it is missing or empty."""
    return values[0] if values else None


def _film_first_name(people):
    """Return the 'name' of the first entry in `people`, or None if absent."""
    first = _film_first(people)
    return first.get('name', None) if first is not None else None


def _film_join(values):
    """Comma-join `values`, skipping None items; None if `values` is None."""
    if values is None:
        return None
    return ', '.join(value for value in values if value is not None)


def get_film_metadata(movie_title: str) -> pd.DataFrame:
    r"""
    Look up a film or tv show on IMDb and return its metadata as one row.

    The title is searched with ``Cinemagoer().search_movie`` and the first
    search hit is assumed to be the intended title.

    Parameters
    ----------
    movie_title : str
        Title of a film or tv show (sensitive to spelling but not case).

    Returns
    -------
    film_df : pd.DataFrame
        A single-row frame. When the search returns a hit, the columns are
        ``title``, ``imdbID``, ``type``, ``year``, ``runtime (min)``,
        ``rating``, ``votes``, ``genres``, ``countries``, ``director``,
        ``writer``, ``composer``, ``cast``, ``plot``, ``synopsis`` and
        ``plot outline``; any field the record does not provide is
        ``None``. List-valued fields (genres, countries, cast) are joined
        with ``', '``; director, writer, composer, runtime, plot and
        synopsis hold only the first entry.

        When the search returns nothing, a message is printed and a
        placeholder row of ``None`` values is returned.

    Notes
    -----
    NOTE(review): the placeholder frame's columns (``kind``, ``runtime``,
    ``directors``, ``writers``, ``composers``, ...) do not match the
    populated frame's columns (``type``, ``runtime (min)``, ``director``,
    ``writer``, ``composer``, ...). They are left unchanged here for
    backward compatibility; confirm with callers before unifying them.

    Examples
    --------
    .. doctest::
        :skipif: skip_slow

        Setup

        >>> from datopy._media_scrape import get_film_metadata
        >>> title = 'donnie darko'
        >>> film_df = get_film_metadata(title)
    """
    # Create an IMDb object and search for the movie by title
    ia = Cinemagoer()
    movies = ia.search_movie(movie_title)

    if not movies:
        print(f"{movie_title} not found.")
        movie_fields = ['title', 'imdbID', 'kind', 'year', 'runtime',
                        'genres', 'countries', 'directors', 'writers',
                        'composers', 'cast', 'rating', 'votes',
                        'plot outline', 'plot', 'synopsis']
        return pd.DataFrame(
            [{movie_field: None for movie_field in movie_fields}]
        )

    # Get the first movie (assumed to be the correct one)
    movie = ia.get_movie(movies[0].movieID)

    # Extract movie attributes. Every optional field is read through
    # `movie.get` plus a helper so that a record lacking it yields None
    # instead of raising TypeError / KeyError / IndexError.
    # TODO Identify and consider generalizability of extraction patterns
    cast = movie.get('cast', None)
    movie_data = {
        "title": movie.get('title', None),
        "imdbID": movie.get('imdbID', None),
        "type": movie.get('kind', None),
        "year": movie.get('year', None),
        "runtime (min)": _film_first(movie.get('runtime', None)),
        "rating": movie.get('rating', None),
        "votes": movie.get('votes', None),
        "genres": _film_join(movie.get('genres', None)),
        "countries": _film_join(movie.get('countries', None)),
        "director": _film_first_name(movie.get('director', None)),
        "writer": _film_first_name(movie.get('writer', None)),
        "composer": _film_first_name(movie.get('composer', None)),
        "cast": (
            _film_join([actor.get('name', None) for actor in cast])
            if cast is not None else None
        ),
        "plot": _film_first(movie.get('plot', None)),
        "synopsis": _film_first(movie.get('synopsis', None)),
        "plot outline": movie.get('plot outline', None),
    }

    # Create a DataFrame
    return pd.DataFrame([movie_data])
if __name__ == "__main__":
    # Two ways to run the doctests in this script:
    #   (1) every doctest in the module -- enable the `doctest.testmod` line
    #       and comment out (2);
    #   (2) a single function's doctests -- the active call below.
    # doctest.testmod(verbose=True)
    doctest_function(get_imdb_id, globs=globals())

    # One-off tests go below.