# Source code for datopy.etl

"""
Tools for efficient web-based data retrieval, data processing,
table creation, and populating empty metadata fields.

.. note:: WIP.

Overview
~~~~~~~~

.. currentmodule:: datopy.etl

.. rubric:: Extract

Utilities for data retrieval.

.. autosummary::
    :nosignatures:

.. rubric:: Transform

Basic data processing and transformation of raw data.

.. autosummary::
    :nosignatures:

    omit_string_patterns

.. rubric:: Load

Utilities related to finding and loading data into a database.

.. autosummary::
    :nosignatures:

    retrieve_wiki_topics

API
~~~
"""

import re
import sys
import pprint
import doctest
import wptools

from datopy.workflow import doctest_function
from datopy.util._numpydoc_validate import numpydoc_validate_module


# -- Extract -----------------------------------------------------------------


# -- Transform ---------------------------------------------------------------


def omit_string_patterns(input_string: str, patterns: list[str]) -> str:
    r"""
    Prune multiple character patterns from a string.

    Each pattern is treated as a literal substring (regex metacharacters are
    escaped) and all occurrences are removed in a single left-to-right pass.
    Where several patterns could match at the same position, the longest one
    is removed, so the result does not depend on the order in which patterns
    sharing a prefix (e.g. ``"un"`` and ``"undo"``) are listed.

    Parameters
    ----------
    input_string : str
        The to-be-cleaned string.
    patterns : list[str]
        A list of patterns to omit from the string. Empty strings are
        ignored.

    Returns
    -------
    str
        The input string with the supplied patterns omitted.

    Examples
    --------
    >>> from datopy.etl import omit_string_patterns
    >>> input_string = "[[A \\\\ messy * string * with undesirable /patterns]]"
    >>> patterns_to_omit = ["[[", "]]", "* ", "\\\\ ", "/", "messy ", "un" ]
    >>> output_string = omit_string_patterns(input_string, patterns_to_omit)
    >>> print(output_string)
    A string with desirable patterns
    """
    # Regex alternation takes the first alternative that matches, so a short
    # pattern listed before a longer one sharing its prefix would shadow it
    # and leave a fragment behind. Trying longer patterns first avoids that.
    ordered = sorted((p for p in patterns if p), key=len, reverse=True)
    if not ordered:
        # Nothing to remove; skip building an empty regex.
        return input_string

    pattern = '|'.join(re.escape(p) for p in ordered)
    return re.sub(pattern, '', input_string)


# -- Load --------------------------------------------------------------------

# -- Topic retrieval ---------------------------------------------------------

# TODO: take first and last entry (relative indices, non-0'ed)
def retrieve_wiki_topics(
    listing_page: str,
    verbose: bool = True,
    *,
    skip_leading: int = 4,
    skip_trailing: int = 3,
) -> list[str]:
    """
    Compile a list of related topics by scraping a Wikipedia page.

    Parameters
    ----------
    listing_page : str
        The title of a Wikipedia article containing topics to be retrieved.
    verbose : bool, default=True
        Option to enable/disable printouts.
    skip_leading : int, default=4
        Number of extracted links to drop from the start of the page.
    skip_trailing : int, default=3
        Number of extracted links to drop from the end of the page.

    Returns
    -------
    list[str]
        A list of topics (by article name) extracted from the listing page.

    Raises
    ------
    ValueError
        If ``skip_leading`` or ``skip_trailing`` is negative.

    Notes
    -----
    Only hyperlinked topics (those with a Wikipedia page) are retrieved.

    The default trim values reproduce the slice that was previously
    hard-coded (``[4:-3]``); presumably they discard non-topic links at the
    top and bottom of the page this was developed against. Other listing
    pages will likely need different values.

    Search Wikipedia's catalogue of listing pages here:
    https://en.wikipedia.org/wiki/List_of_lists_of_lists
    """
    if skip_leading < 0 or skip_trailing < 0:
        raise ValueError("skip_leading and skip_trailing must be non-negative")

    wiki_parse = wptools.page(listing_page).get_parse().data['parsetree']

    # Capture the contents of every ``[[...]]`` wiki link; for piped links
    # (``[[Target|label]]``) keep only the part before the first ``|``.
    regex_pattern = re.compile(r"\[\[(.*?)\]\]")
    matches = regex_pattern.findall(wiki_parse)
    pages = [match.split('|')[0].strip() for match in matches]

    # An explicit, clamped stop index keeps ``skip_trailing=0`` meaningful
    # (a bare ``-0`` slice bound would yield an empty list) and never wraps
    # around when the page has fewer links than ``skip_trailing``.
    stop = max(len(pages) - skip_trailing, 0)
    target_pages = pages[skip_leading:stop]

    if verbose:
        pprint.pp(target_pages)
    return target_pages
if __name__ == "__main__":
    # Comment out (2) to run all tests in script; (1) to run specific tests
    # doctest.testmod(verbose=True)
    # doctest_function(get_film_metadata, globs=globals())

    # Run the numpydoc validation helper on this module. Inside this guard
    # ``__name__`` is "__main__", so the lookup resolves to the script itself.
    this_module = sys.modules[__name__]
    numpydoc_validate_module(this_module)

    # One-off tests
    # listing_page = "List of legendary creatures from China"
    # retrieve_wiki_topics(listing_page)