From 0a3759dbf3d72569809782857fb488c9fc5e50f2 Mon Sep 17 00:00:00 2001
From: asc
Date: Tue, 25 Oct 2022 11:45:25 -0500
Subject: [PATCH] initial commit

---
 .gitignore             |  19 ++++
 README.md              |  73 ++++++++++++++++
 setup.cfg              |   3 +
 setup.py               |  48 ++++++++++
 tests/__init__.py      |   1 +
 tests/test_cli.py      |  74 ++++++++++++++++
 tox.ini                |  36 ++++++++
 wookiee_dl/__init__.py |   0
 wookiee_dl/cli.py      | 193 +++++++++++++++++++++++++++++++++++++++++
 9 files changed, 447 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 setup.cfg
 create mode 100644 setup.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_cli.py
 create mode 100644 tox.ini
 create mode 100644 wookiee_dl/__init__.py
 create mode 100644 wookiee_dl/cli.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9605f2c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+*.pyc
+
+dist/
+build/
+*.egg-info/
+
+.tox/
+.coverage
+
+.idea/
+
+TODO*
+*.lwp
+*.ctrl
+apicache*
+venv/
+families*
+
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9b764ca
--- /dev/null
+++ b/README.md
@@ -0,0 +1,73 @@
+# Wookiee DL
+
+Command line interface of convenience utilities for use with Wookieepedia
+  (via pywikibot and mediawiki)
+
+
+# Installation
+
+Simply run:
+
+    $ pip install .
+
+
+# Usage
+
+To use it:
+
+```
+$ wookiee-dl --help
+
+Usage: python -m wookiee-dl [OPTIONS] COMMAND [ARGS]...
+
+  Command line interface of convenience utilities for use with Wookieepedia
+  (via pywikibot and mediawiki)
+
+Options:
+  --version  Show the version and exit.
+  --help     Show this message and exit.
+
+Commands:
+  html    Outputs the Wiki page as an HTML document
+  image   Download the main image for a page Requires to know the...
+  images  Dump all images from a given page.
+  text    Return page link in url formats
+```
+
+## HTML
+```
+Usage: python -m wookiee-dl html [OPTIONS] QUERY
+
+  Outputs the Wiki page as an HTML document
+
+Options:
+  -y, --top_result                Automatically use the top search result
+  -o, --output_directory DIRECTORY
+  -n, --output_filename TEXT
+  --help                          Show this message and exit.
+```
+
+## Images
+```
+Usage: python -m wookiee-dl images [OPTIONS] QUERY
+
+  Dump all images from a given page.
+
+Options:
+  -y, --top_result                Automatically use the top search result
+  -o, --output_directory DIRECTORY
+  --help                          Show this message and exit.
+```
+
+## Text
+```
+Usage: python -m wookiee-dl text [OPTIONS] QUERY
+
+  Return page link in url formats
+
+Options:
+  -y, --top_result                Automatically use the top search result
+  -f, --format [html|markdown|url]
+  -o, --output FILE
+  --help                          Show this message and exit.
+```
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..8951ea8
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+[bdist_wheel]
+universal = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..bcc3dc5
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,48 @@
+"""
+Tools for downloading info from Wookieepedia
+"""
+from setuptools import find_packages, setup
+
+dependencies = ['click', 'pywikibot', 'pymediawiki', 'mwparserfromhell']
+
+setup(
+    name='wookieedl',
+    version='0.1.0',
+    url='https://github.com/anthonyscorrea/wookiee-dl',
+    license='BSD',
+    author='Anthony Correa',
+    author_email='a@correa.co',
+    description='Tools for downloading info from Wookieepedia',
+    long_description=__doc__,
+    packages=find_packages(exclude=['tests']),
+    include_package_data=True,
+    zip_safe=False,
+    platforms='any',
+    install_requires=dependencies,
+    entry_points={
+        'console_scripts': [
+            'wookiee-dl = wookiee_dl.cli:cli',
+        ],
+    },
+    classifiers=[
+        # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+        # 'Development Status :: 1 - Planning',
+        # 'Development Status :: 2 - Pre-Alpha',
+        # 'Development Status :: 3 - Alpha',
+        'Development Status :: 4 - Beta',
+        # 'Development Status :: 5 - Production/Stable',
+        # 'Development Status :: 6 - Mature',
+        # 'Development Status :: 7 - Inactive',
+        'Environment :: Console',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: POSIX',
+        'Operating System :: MacOS',
+        'Operating System :: Unix',
+        'Operating System :: Microsoft :: Windows',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3 :: Only',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+    ]
+)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..d20c593
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,74 @@
+import tempfile
+
+from click.testing import CliRunner
+from wookiee_dl import cli
+import unittest
+import shutil
+import pathlib
+
+
+class TestCli(unittest.TestCase):
+    def setUp(self) -> None:
+        self.tempfile_directory = tempfile.mkdtemp()
+        self.runner = CliRunner()
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.tempfile_directory)
+        return super().tearDown()
+
+    def test_cli_001_html_output(self):
+        """Output html page, check that the file is created. No checks for contents of file."""
+        result = self.runner.invoke(
+            cli.cli,
+            [
+                "html",
+                "--output_directory",
+                self.tempfile_directory,
+                "--top_result",
+                "tarkin family",
+            ],
+        )
+        self.assertIsNone(
+            result.exception, f"Exception encountered {result.output.strip()}"
+        )
+        self.assertEqual(
+            result.exit_code,
+            0,
+            f"Exit code not 0 with following output: \n{result.output.strip()}",
+        )
+        path = pathlib.Path(result.output.strip())
+        self.assertEqual((str(path), path.is_file()), (str(path), True))
+
+    def test_cli_003_url(self):
+        """Print url formatted string."""
+        result = self.runner.invoke(
+            cli.cli,
+            ["text", "--format", "url", "--top_result", "tarkin family"],
+            input="".join(["\n"]),
+        )
+        self.assertIsNone(result.exception, f"Exception encountered {result.output}")
+        self.assertEqual(
+            result.exit_code,
+            0,
+            f"Exit code not 0 with following output: \n{result.output}",
+        )
+        self.assertEqual(
+            "https://starwars.fandom.com/wiki/Tarkin_family", result.output.strip()
+        )
+
+    def test_cli_005_images(self):
+        """Dump all images from page to folder. Checks folder was created, no checks for contents of folder."""
+        result = self.runner.invoke(
+            cli.cli,
+            ["images", "tarkin family", "-o", self.tempfile_directory, "--top_result"],
+            input="".join(["\n"]),
+        )
+        self.assertIsNone(result.exception, f"Exception encountered {result.output}")
+        self.assertEqual(
+            result.exit_code,
+            0,
+            f"Exit code not 0 with following output: \n{result.output}",
+        )
+        path = pathlib.Path(result.output.strip())
+        self.assertEqual((str(path), path.is_dir()), (str(path), True))
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..a470058
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,36 @@
+# tox (https://tox.readthedocs.io/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py310,black,flake
+
+[flake8]
+max-line-length = 160
+
+[testenv]
+allowlist_externals = rm
+deps =
+
+commands =
+    ;Generate pywikibot family file
+    python -m pywikibot.scripts.generate_family_file "https://starwars.fandom.com/wiki/Main_Page" "wookieepedia" "n" "n"
+    python -m unittest discover
+    ;remove pywikibot family file
+    rm -rf ./families
+
+[testenv:black]
+deps =
+    black
+
+commands =
+    python -m black wookiee_dl
+    python -m black tests
+
+[testenv:flake]
+deps =
+    flake8
+
+commands =
+    python -m flake8 wookiee_dl
diff --git a/wookiee_dl/__init__.py b/wookiee_dl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wookiee_dl/cli.py b/wookiee_dl/cli.py
new file mode 100644
index 0000000..c2d8ea3
--- /dev/null
+++ b/wookiee_dl/cli.py
@@ -0,0 +1,193 @@
+import os
+from html import escape
+
+import click
+import mediawiki
+from pywikibot import FilePage, Page, Site
+
+site = Site("en", "wookieepedia")
+site_mw = mediawiki.MediaWiki("https://starwars.fandom.com/api.php")
+
+
+def search_for_page_obj(query: str, num_results=10, top_result=False) -> Page:
+    """Resolve a Wookieepedia URL or a search query to a pywikibot Page.
+
+    A query containing the wiki base URL is used as the page title directly.
+    Otherwise the wiki is searched and either the first hit is used
+    (``top_result``) or the user is prompted to choose among the hits.
+    """
+    base_url = "https://starwars.fandom.com/wiki/"
+    if base_url in query:
+        return Page(site, query.replace(base_url, ""))
+
+    search_results = site_mw.search(query, results=num_results)
+    if not search_results:
+        # An empty result list used to crash with an IndexError below.
+        raise click.ClickException(f"No pages found for {query!r}")
+    if top_result:
+        return Page(site, search_results[0])
+
+    title = click.prompt(
+        "Which page do you want?",
+        type=click.Choice(list(search_results), case_sensitive=False),
+        show_choices=True,
+        default=search_results[0],
+    )
+    return Page(site, title)
+
+
+query_argument = click.argument("query", required=True)
+top_result_argument = click.option(
+    "--top_result",
+    "-y",
+    is_flag=True,
+    default=False,
+    help="Automatically use the top search result",
+)
+output_directory_option = click.option(
+    "-o",
+    "--output_directory",
+    type=click.Path(file_okay=False, writable=True, exists=True),
+    default=".",
+)
+
+
+@click.group(invoke_without_command=True)
+@click.version_option(package_name="wookieedl")
+@click.pass_context
+def cli(
+    ctx: click.Context,
+):
+    """Command line interface of convenience utilities for use with Wookieepedia (via pywikibot and mediawiki)"""
+    if ctx.invoked_subcommand is None:
+        # A bare invocation used to exit silently; show the help instead.
+        click.echo(ctx.get_help())
+
+
+@cli.command(name="image")
+@click.pass_context
+@query_argument
+@top_result_argument
+@output_directory_option
+@click.argument(
+    "template",
+    type=click.Choice(
+        ["weapon", "character", "species", "starship_class", "family", "Ship_series"]
+    ),
+    required=True,
+)
+def image(ctx, query, template, top_result, output_directory):
+    """Download the main image for a page
+    Requires to know the template of the main info block on the page.
+    Currently not implemented
+    """
+    # Parameters now match the declared click params (was a TypeError).
+    raise click.ClickException("The 'image' command is not implemented yet")
+    # Unreachable draft of the implementation, kept for reference.
+    page = search_for_page_obj(query, top_result=top_result)
+    block = [
+        parameters_list
+        for template_page, parameters_list in page.templatesWithParams()
+        if template_page.title() == f"Template:{template.capitalize()}"
+    ]
+    params = {k: v for k, _, v in (p.partition("=") for p in block[0])}
+
+    image_page = Page(site, params["image"].replace("[[", "").replace("]]", ""))
+
+    if image_page.is_filepage():
+        filename = os.path.join(
+            output_directory, image_page.title(as_filename=True, with_ns=False)
+        )
+        FilePage(image_page).download(filename)
+    else:
+        click.echo("no image found")
+
+
+@cli.command(name="html")
+@click.pass_context
+@query_argument
+@top_result_argument
+@output_directory_option
+@click.option("-n", "--output_filename", type=str)
+def html_dump(
+    ctx: click.Context,
+    query: str,
+    top_result: bool,
+    output_filename: str,
+    output_directory: click.Path,
+) -> None:
+    """Outputs the Wiki page as an HTML document"""
+    page = search_for_page_obj(query, top_result=top_result)
+    page_mw = site_mw.page(page.title())
+
+    # Honour --output_filename; it used to leave ``filename`` unbound.
+    filename = output_filename or f"{page.title(as_filename=True)}.html"
+    os.makedirs(output_directory, exist_ok=True)
+    filename = os.path.join(output_directory, filename)
+
+    # Write UTF-8 explicitly so the result does not depend on the locale.
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(page_mw.html)
+
+    click.echo(filename)
+
+
+@cli.command(name="images")
+@click.pass_context
+@query_argument
+@top_result_argument
+@output_directory_option
+def dump_all_images(
+    ctx: click.Context, query: str, top_result, output_directory: click.Path
+) -> None:
+    """Dump all images from a given page."""
+    page = search_for_page_obj(query, top_result=top_result)
+    destination = os.path.join(output_directory, page.title(as_filename=True))
+    os.makedirs(destination, exist_ok=True)
+
+    for image_page in page.imagelinks():
+        filename = os.path.join(
+            destination, image_page.title(as_filename=True, with_ns=False)
+        )
+        image_page.download(filename)
+
+    click.echo(destination)
+
+
+@cli.command(name="text")
+@click.pass_context
+@query_argument
+@top_result_argument
+@click.option(
+    "-f",
+    "--format",
+    type=click.Choice(["html", "markdown", "url"], case_sensitive=False),
+    prompt=True,
+    default="url",
+)
+@click.option(
+    "-o",
+    "--output",
+    "output_file",
+    type=click.Path(exists=False, dir_okay=False, allow_dash=True, writable=True),
+    default="-",
+)
+def output_text(
+    ctx: click.Context, query: str, top_result, format, output_file
+) -> None:
+    """Return page link in url formats"""
+    page = search_for_page_obj(query, top_result=top_result)
+    title = page.title().title()
+    url = page.full_url()
+    links = {
+        "markdown": f"[{title} on Wookieepedia]({url})",
+        # The HTML variant used to be an empty string.
+        "html": f'<a href="{escape(url)}">{escape(title)} on Wookieepedia</a>',
+        "url": url,
+    }
+    with click.open_file(output_file, "w") as f:
+        f.write(links.get(format, ""))
+
+
+if __name__ == "__main__":
+    cli()