Source code for datopy.etl
"""
Tools for efficient web-based data retrieval, data processing,
table creation, and populating empty metadata fields.
.. note:: WIP.
Overview
~~~~~~~~
.. currentmodule:: datopy.etl
.. rubric:: Extract
Utilities for data retrieval.
.. autosummary::
:nosignatures:
.. rubric:: Transform
Basic data processing and transformation of raw data.
.. autosummary::
:nosignatures:
omit_string_patterns
.. rubric:: Load
Utilities related to finding and loading data into a database.
.. autosummary::
:nosignatures:
retrieve_wiki_topics
API
~~~
"""
import re
import sys
import pprint
import doctest
import wptools
from datopy.workflow import doctest_function
from datopy.util._numpydoc_validate import numpydoc_validate_module
# -- Extract -----------------------------------------------------------------
# -- Transform ---------------------------------------------------------------
[docs]
def omit_string_patterns(input_string: str, patterns: list[str]) -> str:
r"""
Prune multiple character patterns from a string.
Parameters
----------
input_string : str
The to-be-cleaned string.
patterns : list[str]
A list of patterns to omit from the string.
Returns
-------
str
The input string with the supplied patterns ommitted.
Examples
--------
>>> from datopy.etl import omit_string_patterns
>>> input_string = "[[A \\\\ messy * string * with undesirable /patterns]]"
>>> patterns_to_omit = ["[[", "]]", "* ", "\\\\ ", "/", "messy ", "un" ]
>>> output_string = omit_string_patterns(input_string, patterns_to_omit)
>>> print(output_string)
A string with desirable patterns
"""
pattern = '|'.join(re.escape(p) for p in patterns)
return re.sub(pattern, '', input_string)
# -- Load --------------------------------------------------------------------
# -- Topic retrieval ---------------------------------------------------------
# TODO: take first and last entry (relative indices non-0'ed))
[docs]
def retrieve_wiki_topics(listing_page: str, verbose: bool = True) -> list[str]:
"""
Compile a list of related topics by scraping a Wikipedia page.
Parameters
----------
listing_page : str
The title of a Wikipedia article containing topics to be retrieved.
verbose : bool, default=True
Option to enable/disable printouts.
Returns
-------
list[str]
A list of topics (by article name) extracted from the listing page.
Notes
-----
Only hyperlinked topics (those with a Wikipedia page) are retrieved.
Search Wikipedia's catalogue of listing pages here:
https://en.wikipedia.org/wiki/List_of_lists_of_lists
"""
wiki_parse = wptools.page(listing_page).get_parse().data['parsetree']
regex_pattern = re.compile(r"\[\[(.*?)\]\]")
matches = regex_pattern.findall(wiki_parse)
pages = [match.split('|')[0].strip() for match in matches]
target_pages = pages[4:-3]
if verbose:
pprint.pp(target_pages)
return target_pages
if __name__ == "__main__":
# Comment out (2) to run all tests in script; (1) to run specific tests
# doctest.testmod(verbose=True)
# doctest_function(get_film_metadata, globs=globals())
numpydoc_validate_module(sys.modules['__main__'])
# One-off tests
# listing_page = "List of legendary creatures from China"
# retrieve_wiki_topics(listing_page)