initial commit
This commit is contained in:
19
.gitignore
vendored
Normal file
19
.gitignore
vendored
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
*.pyc
|
||||||
|
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
*.egg-info/
|
||||||
|
|
||||||
|
.tox/
|
||||||
|
.coverage
|
||||||
|
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
TODO*
|
||||||
|
*.lwp
|
||||||
|
*.ctrl
|
||||||
|
apicache*
|
||||||
|
venv/
|
||||||
|
families*
|
||||||
|
|
||||||
|
|
||||||
73
README.md
Normal file
73
README.md
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# Wookiee DL
|
||||||
|
|
||||||
|
Command line interface of convenience utilities for use with Wookieepedia
|
||||||
|
(via pywikibot and mediawiki)
|
||||||
|
|
||||||
|
|
||||||
|
# Installation
|
||||||
|
|
||||||
|
Simply run:
|
||||||
|
|
||||||
|
$ pip install .
|
||||||
|
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
To use it:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ wookiee-dl --help
|
||||||
|
|
||||||
|
Usage: python -m wookiee-dl [OPTIONS] COMMAND [ARGS]...
|
||||||
|
|
||||||
|
Command line interface of convenience utilities for use with Wookieepedia
|
||||||
|
(via pywikibot and mediawiki)
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--version Show the version and exit.
|
||||||
|
--help Show this message and exit.
|
||||||
|
|
||||||
|
Commands:
|
||||||
|
html Outputs the Wiki page as an HTML document
|
||||||
|
image Download the main image for a page Requires to know the...
|
||||||
|
images Dump all images from a given page.
|
||||||
|
text Return page link in url formats
|
||||||
|
```
|
||||||
|
|
||||||
|
## HTML
|
||||||
|
```
|
||||||
|
Usage: python -m wookiee-dl html [OPTIONS] QUERY
|
||||||
|
|
||||||
|
Outputs the Wiki page as an HTML document
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-y, --top_result Automatically use the top search result
|
||||||
|
-o, --output_directory DIRECTORY
|
||||||
|
-n, --output_filename TEXT
|
||||||
|
--help Show this message and exit.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Images
|
||||||
|
```
|
||||||
|
Usage: python -m wookiee-dl images [OPTIONS] QUERY
|
||||||
|
|
||||||
|
Dump all images from a given page.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-y, --top_result Automatically use the top search result
|
||||||
|
-o, --output_directory DIRECTORY
|
||||||
|
--help Show this message and exit.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Text
|
||||||
|
```
|
||||||
|
Usage: python -m wookiee-dl text [OPTIONS] QUERY
|
||||||
|
|
||||||
|
Return page link in url formats
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-y, --top_result Automatically use the top search result
|
||||||
|
-f, --format [html|markdown|url]
|
||||||
|
-o, --output FILE
|
||||||
|
--help Show this message and exit.
|
||||||
|
```
|
||||||
48
setup.py
Normal file
48
setup.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
"""
|
||||||
|
Tools for downloading info from Wookieepedia
|
||||||
|
"""
|
||||||
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
|
# Runtime dependencies installed together with the package.
dependencies = ['click', 'pywikibot', 'pymediawiki', 'mwparserfromhell']

setup(
    name='wookieedl',
    version='0.1.0',
    url='https://github.com/anthonyscorrea/wookiee-dl',
    license='BSD',
    author='Anthony Correa',
    author_email='a@correa.co',
    description='Tools for downloading info from Wookieepedia',
    # Reuses this file's module docstring as the long description.
    long_description=__doc__,
    packages=find_packages(exclude=['tests']),
    include_package_data=True,
    zip_safe=False,
    platforms='any',
    # The package code uses f-strings and function annotations, so it cannot
    # run on Python 2; declare that so pip refuses unsupported interpreters.
    python_requires='>=3.6',
    install_requires=dependencies,
    entry_points={
        # Installs the ``wookiee-dl`` executable, bound to the click group.
        'console_scripts': [
            'wookiee-dl = wookiee_dl.cli:cli',
        ],
    },
    classifiers=[
        # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers
        # 'Development Status :: 1 - Planning',
        # 'Development Status :: 2 - Pre-Alpha',
        # 'Development Status :: 3 - Alpha',
        'Development Status :: 4 - Beta',
        # 'Development Status :: 5 - Production/Stable',
        # 'Development Status :: 6 - Mature',
        # 'Development Status :: 7 - Inactive',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Operating System :: POSIX',
        'Operating System :: MacOS',
        'Operating System :: Unix',
        'Operating System :: Microsoft :: Windows',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
|
||||||
1
tests/__init__.py
Normal file
1
tests/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
74
tests/test_cli.py
Normal file
74
tests/test_cli.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
import tempfile
|
||||||
|
|
||||||
|
from click.testing import CliRunner
|
||||||
|
from wookiee_dl import cli
|
||||||
|
import unittest
|
||||||
|
import shutil
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
|
||||||
|
class TestCli(unittest.TestCase):
    """Drive the click commands through ``CliRunner`` and check their output."""

    def setUp(self) -> None:
        # Scratch directory that the file-producing commands write into.
        self.tempfile_directory = tempfile.mkdtemp()
        self.runner = CliRunner()
        return super().setUp()

    def tearDown(self) -> None:
        # Remove whatever the commands wrote during the test.
        shutil.rmtree(self.tempfile_directory)
        return super().tearDown()

    def test_cli_001_html_output(self):
        """Output html page, check that the file is created. No checks for contents of file."""
        argv = [
            "html",
            "--output_directory",
            self.tempfile_directory,
            "--top_result",
            "tarkin family",
        ]
        result = self.runner.invoke(cli.cli, argv)
        printed = result.output.strip()

        self.assertIsNone(result.exception, f"Exception encountered {printed}")
        self.assertEqual(
            result.exit_code,
            0,
            f"Exit code not 0 with following output: \n{printed}",
        )

        # The command echoes the path it wrote; pairing the path with the
        # check result makes a failure message show which path was tested.
        written = pathlib.Path(printed)
        self.assertEqual((str(written), written.is_file()), (str(written), True))

    def test_cli_003_url(self):
        """Print url formatted string."""
        argv = ["text", "--format", "url", "--top_result", "tarkin family"]
        result = self.runner.invoke(cli.cli, argv, input="\n")

        self.assertIsNone(result.exception, f"Exception encountered {result.output}")
        self.assertEqual(
            result.exit_code,
            0,
            f"Exit code not 0 with following output: \n{result.output}",
        )
        self.assertEqual(
            "https://starwars.fandom.com/wiki/Tarkin_family", result.output.strip()
        )

    def test_cli_005_images(self):
        """Dump all images from page to folder. Checks folder was created, no checks for contents of folder."""
        argv = ["images", "tarkin family", "-o", self.tempfile_directory, "--top_result"]
        result = self.runner.invoke(cli.cli, argv, input="\n")

        self.assertIsNone(result.exception, f"Exception encountered {result.output}")
        self.assertEqual(
            result.exit_code,
            0,
            f"Exit code not 0 with following output: \n{result.output}",
        )

        # The command echoes the directory it filled.
        dumped = pathlib.Path(result.output.strip())
        self.assertEqual((str(dumped), dumped.is_dir()), (str(dumped), True))
|
||||||
36
tox.ini
Normal file
36
tox.ini
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# tox (https://tox.readthedocs.io/) is a tool for running tests
|
||||||
|
# in multiple virtualenvs. This configuration file will run the
|
||||||
|
# test suite on all supported python versions. To use it, "pip install tox"
|
||||||
|
# and then run "tox" from this directory.
|
||||||
|
|
||||||
|
[tox]
|
||||||
|
envlist = py310,black,flake
|
||||||
|
|
||||||
|
[flake8]
|
||||||
|
max-line-length = 160
|
||||||
|
|
||||||
|
[testenv]
|
||||||
|
allowlist_externals = rm
|
||||||
|
deps =
|
||||||
|
|
||||||
|
commands =
|
||||||
|
;Generate pywikibot family file
|
||||||
|
python -m pywikibot.scripts.generate_family_file "https://starwars.fandom.com/wiki/Main_Page" "wookieepedia" "n" "n"
|
||||||
|
python -m unittest discover
|
||||||
|
;remove pywikibot family file
|
||||||
|
rm -rf ./families
|
||||||
|
|
||||||
|
[testenv:black]
|
||||||
|
deps =
|
||||||
|
black
|
||||||
|
|
||||||
|
commands =
|
||||||
|
python -m black wookiee_dl
|
||||||
|
python -m black tests
|
||||||
|
|
||||||
|
[testenv:flake]
|
||||||
|
deps =
|
||||||
|
flake8
|
||||||
|
|
||||||
|
commands =
|
||||||
|
python -m flake8 wookiee_dl
|
||||||
0
wookiee_dl/__init__.py
Normal file
0
wookiee_dl/__init__.py
Normal file
193
wookiee_dl/cli.py
Normal file
193
wookiee_dl/cli.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import click
|
||||||
|
import mediawiki
|
||||||
|
from pywikibot import FilePage, Page, Site
|
||||||
|
|
||||||
|
# pywikibot handle for the "wookieepedia" family (language code "en"); every Page in this module is built on it.
site = Site("en", "wookieepedia")
|
||||||
|
# pymediawiki client for the Fandom API endpoint; used below for searching and for fetching rendered HTML.
site_mw = mediawiki.MediaWiki("https://starwars.fandom.com/api.php")
|
||||||
|
|
||||||
|
|
||||||
|
def search_for_page_obj(query: str, num_results=10, top_result=False) -> Page:
    """Resolve a search query or a wiki URL to a pywikibot ``Page``.

    If ``query`` contains the Wookieepedia article base URL, that prefix is
    removed and the remainder is used directly as the page title. Otherwise
    the query is searched through the MediaWiki client and either the first
    hit is taken (``top_result``) or the user is prompted to pick one.

    Args:
        query: Search terms, or a ``https://starwars.fandom.com/wiki/...`` URL.
        num_results: Maximum number of search hits to request.
        top_result: Take the first search hit instead of prompting.

    Returns:
        A ``Page`` on the module-level ``site`` for the selected title.

    Raises:
        click.ClickException: If the search returns no results.
    """
    base_url = "https://starwars.fandom.com/wiki/"
    if base_url in query:
        # A pasted article URL already names the page; no search needed.
        return Page(site, query.replace(base_url, ""))

    search_results = list(site_mw.search(query, results=num_results))
    if not search_results:
        # Indexing an empty result list would otherwise surface as a bare
        # IndexError traceback; report it as a normal CLI error instead.
        raise click.ClickException(f"No pages found for {query!r}")

    if top_result:
        return Page(site, search_results[0])

    chosen_title = click.prompt(
        "Which page do you want?",
        type=click.Choice(search_results, case_sensitive=False),
        show_choices=True,
        default=search_results[0],
    )
    return Page(site, chosen_title)
|
||||||
|
|
||||||
|
|
||||||
|
# Shared click decorators, built once here and applied to several commands in this module.
# Required positional QUERY argument.
query_argument = click.argument("query", required=True)
|
||||||
|
# -y/--top_result flag: when set, the first search hit is used instead of prompting.
top_result_argument = click.option(
|
||||||
|
"--top_result",
|
||||||
|
"-y",
|
||||||
|
is_flag=True,
|
||||||
|
default=False,
|
||||||
|
help="Automatically use the top search result",
|
||||||
|
)
|
||||||
|
# -o/--output_directory: must name an existing, writable directory; defaults to the current directory.
output_directory_option = click.option(
|
||||||
|
"-o",
|
||||||
|
"--output_directory",
|
||||||
|
type=click.Path(file_okay=False, writable=True, exists=True),
|
||||||
|
default=".",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@click.group(invoke_without_command=True)
@click.version_option()
@click.pass_context
def cli(
    ctx: click.Context,
):
    """Command line interface of convenience utilities for use with Wookieepedia (via pywikibot and mediawiki)"""
    # With invoke_without_command=True this body also runs when no subcommand
    # is given; print the help text rather than exiting without any output.
    if ctx.invoked_subcommand is None:
        click.echo(ctx.get_help())
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command(name="image")
@click.pass_context
@query_argument
@top_result_argument
@output_directory_option
@click.argument(
    "template",
    type=click.Choice(
        ["weapon", "character", "species", "starship_class", "family", "Ship_series"]
    ),
    required=True,
)
def image(ctx, query, template, top_result, output_directory):
    """Download the main image for a page
    Requires to know the template of the main info block on the page.
    Currently not implemented
    """
    # The parameter names must match what the decorators above declare
    # (query, top_result, output_directory, template); click passes them by
    # keyword, so a mismatched signature fails with TypeError before the
    # command body ever runs.
    #
    # TODO: implement. Sketch of the intended approach:
    #   1. resolve the page with search_for_page_obj(query, top_result=top_result)
    #   2. find the parameters of "Template:<Template>" via page.templatesWithParams()
    #   3. read the "image" parameter, strip the [[ ]] link markup, build a FilePage
    #   4. download it into output_directory, or echo "no image found"
    raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command(name="html")
@click.pass_context
@query_argument
@top_result_argument
@output_directory_option
@click.option("-n", "--output_filename", type=str)
def html_dump(
    ctx: click.Context,
    query: str,
    top_result: bool,
    output_filename: str,
    output_directory: str,
) -> None:
    """Outputs the Wiki page as an HTML document"""
    page = search_for_page_obj(query, top_result=top_result)

    # Honour --output_filename when given; otherwise derive the name from the
    # page title. (Previously the name was only assigned in the "not given"
    # branch, so passing --output_filename left it unbound.)
    filename = output_filename or f"{page.title(as_filename=True)}.html"
    os.makedirs(output_directory, exist_ok=True)
    filename = os.path.join(output_directory, filename)

    doc = site_mw.page(page.title()).html
    # Explicit UTF-8 so non-ASCII page content does not depend on the
    # platform's default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(doc)

    # Print the written path so callers and scripts can pick it up.
    click.echo(filename)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command(name="images")
@click.pass_context
@query_argument
@top_result_argument
@output_directory_option
def dump_all_images(
    ctx: click.Context, query: str, top_result, output_directory: click.Path
) -> None:
    """Dump all images from a given page."""
    wiki_page = search_for_page_obj(query, top_result=top_result)

    # Images go into a sub-directory named after the page.
    target_dir = os.path.join(output_directory, wiki_page.title(as_filename=True))
    os.makedirs(target_dir, exist_ok=True)

    for file_page in wiki_page.imagelinks():
        # Use the file title without its namespace prefix as the local name.
        local_name = file_page.title(as_filename=True, with_ns=False)
        file_page.download(os.path.join(target_dir, local_name))

    # Print the directory so callers and scripts can pick it up.
    click.echo(target_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command(name="text")
@click.pass_context
@query_argument
@top_result_argument
@click.option(
    "-f",
    "--format",
    type=click.Choice(["html", "markdown", "url"], case_sensitive=False),
    prompt=True,
    default="url",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.Path(exists=False, dir_okay=False, allow_dash=True, writable=True),
    default="-",
)
def output_text(
    ctx: click.Context, query: str, top_result, format, output_file
) -> None:
    """Return page link in url formats"""
    page = search_for_page_obj(query, top_result=top_result)

    if format == "markdown":
        string_format = "[{title} on Wookieepedia]({url})"
    elif format == "html":
        # The closing quote of href must come before ">", otherwise the
        # emitted anchor tag is malformed.
        string_format = '<a href="{url}">{title} on Wookieepedia</a>'
    elif format == "url":
        string_format = "{url}"
    else:
        string_format = ""

    # NOTE(review): str.title() re-capitalises every word of the page title;
    # confirm that is intended rather than the title as stored on the wiki.
    link_text = string_format.format(title=page.title().title(), url=page.full_url())

    # "-" (the default) makes click.open_file write to stdout.
    with click.open_file(output_file, "w") as f:
        f.write(link_text)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the click group when this module is executed directly as a script.
if __name__ == "__main__":
|
||||||
|
cli()
|
||||||
Reference in New Issue
Block a user