initial commit

commit 4105cc2373
2023-12-31 14:28:02 -06:00
9 changed files with 653 additions and 0 deletions

.env (new file, 0 lines)

.gitignore (new file, 3 lines, vendored)

@@ -0,0 +1,3 @@
venv
data
__pycache__/


@@ -0,0 +1,2 @@
from .convert_to_sportspress import app
app()


@@ -0,0 +1,238 @@
import csv
import re
from typing import List, Dict
from dateutil import parser
from pathlib import Path
from rich.console import Console
from rich.panel import Panel
from rich.table import Table, Column
from rich.columns import Columns
import typer
from .utils import normalize_header_key, validate_csv_header, read_csv, is_visitor_home_order_reversed, process_data, aggregate_teams, write_sportspress_csv
app = typer.Typer()


@app.command()
def standings(file_path: Path = typer.Argument(..., help="Path to the CSV file")):
    # Validate CSV header
    header = next(csv.reader(open(file_path, "r")))
    normalized_header = [normalize_header_key(key) for key in header]
    if not validate_csv_header(header):
        typer.echo("Error: Invalid CSV header. Make sure the CSV file contains the correct headers.")
        return

    # Read CSV data
    data = read_csv(file_path)
    visitor_home_order_reversed = is_visitor_home_order_reversed(normalized_header)
    processed_data = process_data(data, visitor_home_order_reversed)
    aggregate_team_data = aggregate_teams(processed_data)

    # Display aggregated data as a table
    console = Console()
    table = Table(title="Aggregated Team Data")
    table.add_column("Team", style="bold")
    table.add_column("Wins", style="bold")
    table.add_column("Losses", style="bold")
    table.add_column("Ties", style="bold")
    table.add_column("Runs For", style="bold")
    table.add_column("Runs Against", style="bold")
    for team_stats in aggregate_team_data:
        table.add_row(
            team_stats["team"],
            str(team_stats["win"]),
            str(team_stats["loss"]),
            str(team_stats["tie"]),
            str(team_stats["runs_for"]),
            str(team_stats["runs_against"]),
        )
    console.print(table)

    # Write processed CSV data back to a new file
    # output_file_path = file_path.with_suffix(".processed.csv")
    # write_csv(output_file_path, data)
    # typer.echo(f"Processed data written to: {output_file_path}")


@app.command()
def sportspress_csv(
    file_path: Path = typer.Argument(..., help="Path to the CSV file"),
    file_output_path: Path = typer.Argument(..., help="Path to the output CSV file"),
    only_with_outcome: bool = typer.Option(default=True, is_flag=True, help="Only write games that have a recorded result."),
):
    # Validate CSV header
    header = next(csv.reader(open(file_path, "r")))
    normalized_header = [normalize_header_key(key) for key in header]
    if not validate_csv_header(header):
        typer.echo("Error: Invalid CSV header. Make sure the CSV file contains the correct headers.")
        return

    # Read CSV data
    data = read_csv(file_path)
    visitor_home_order_reversed = is_visitor_home_order_reversed(normalized_header)
    processed_data = process_data(data, visitor_home_order_reversed)
    write_sportspress_csv(processed_data, file_output_path, only_with_outcome)
    typer.echo(f"Output to {file_output_path}")


def list_key_values(data: List[Dict], key):
    if key.lower() == "team":
        normalized_key = "team"
    else:
        normalized_key = normalize_header_key(key)
    if normalized_key != "team" or "team" in data[0].keys():
        output = {row.get(normalized_key) for row in data}
    else:
        output = {row.get("home") for row in data}
        output = output | {row.get("visitor") for row in data}
    return output


def replace_key_values(data: List[Dict], key, match: str, replace: str, is_regex: bool = False):
    if not is_regex:
        # Treat the match as a literal value: escape it and anchor it to the whole field.
        regex = re.compile(f"^{re.escape(match)}$")
    else:
        regex = re.compile(match)
    for row in data:
        row[key] = regex.sub(replace, row[key])
    return data


def add_key_values(data: List[Dict], key, value: str):
    for row in data:
        row[key] = value
    return data


clean_app = typer.Typer()


@clean_app.command("list")
def print_values_for_key(
    file_path: Path = typer.Argument(..., help="Path to the CSV file"),
    key: str = typer.Argument(..., help="Column whose distinct values should be listed."),
):
    # Read CSV data
    data = read_csv(file_path)
    processed_data = list_key_values(data, key)
    console = Console()
    table = Table(show_header=False)
    table.add_column("Values")
    for value in sorted(processed_data):
        table.add_row(value)
    console.print(table)


@clean_app.command("replace")
def replace_values_for_key(
    file_path: Path = typer.Argument(..., help="Path to the CSV file"),
    key: str = typer.Argument(..., help="Column to modify."),
    match: str = typer.Argument(..., help="Value (or regex pattern with --regex) to match."),
    replace: str = typer.Argument(..., help="Replacement value."),
    in_place: bool = typer.Option(False, "--in-place", "-p", help="Modify file in place."),
    output_file: Path = typer.Option(None, "--output-file", "-o", help="Specify output file."),
    match_is_regex: bool = typer.Option(False, "--regex", "-r", help="Match is a regex pattern."),
):
    if in_place and output_file:
        typer.echo("Error: Only one of --in-place or --output-file should be provided, not both.")
        raise typer.Abort()
    if key.lower() == "team":
        normalized_key = "team"
    else:
        normalized_key = normalize_header_key(key)
    console = Console()

    # Read CSV data
    data = read_csv(file_path)
    before_table = Table(Column(), show_header=False, title="Before")
    for value in sorted(list_key_values(data, key)):
        before_table.add_row(value)

    after_table = Table(Column(), show_header=False, title="After")
    if normalized_key != "team" or "team" in data[0].keys():
        data = replace_key_values(data, normalized_key, match, replace, match_is_regex)
    else:
        data = replace_key_values(data, "home", match, replace, match_is_regex)
        data = replace_key_values(data, "visitor", match, replace, match_is_regex)
    for value in sorted(list_key_values(data, key)):
        after_table.add_row(value)

    panel = Panel(
        Columns([before_table, after_table]),
        title="Replace"
    )
    console.print(panel)

    if in_place and typer.confirm("Perform Replacement in-place?"):
        with file_path.open('w') as f:
            fieldnames = data[0].keys()
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
    elif output_file:
        if output_file.is_dir():
            output_file = output_file.joinpath(file_path.name)
        if typer.confirm(f"Write to {output_file}?"):
            with output_file.open('w') as f:
                fieldnames = data[0].keys()
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(data)


@clean_app.command("add-key")
def add_values_for_key(
    file_path: Path = typer.Argument(..., help="Path to the CSV file"),
    key: str = typer.Argument(..., help="Name of the column to add."),
    value: str = typer.Argument("", help="Value to set for every row."),
    in_place: bool = typer.Option(False, "--in-place", "-p", help="Modify file in place."),
    output_file: Path = typer.Option(None, "--output-file", "-o", help="Specify output file."),
):
    if in_place and output_file:
        typer.echo("Error: Only one of --in-place or --output-file should be provided, not both.")
        raise typer.Abort()

    # Validate CSV header
    header = next(csv.reader(open(file_path, "r")))
    normalized_header = [normalize_header_key(key) for key in header]
    if key.lower() == "team":
        normalized_key = "team"
    else:
        normalized_key = normalize_header_key(key)
    if not validate_csv_header(header):
        typer.echo("Error: Invalid CSV header. Make sure the CSV file contains the correct headers.")
        return
    console = Console()

    # Read CSV data
    data = read_csv(file_path)
    data = add_key_values(data, key, value)

    if in_place and typer.confirm("Perform Replacement in-place?"):
        with file_path.open('w') as f:
            fieldnames = data[0].keys()
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
    elif output_file:
        if output_file.is_dir():
            output_file = output_file.joinpath(file_path.name)
        if typer.confirm(f"Write to {output_file}?"):
            with output_file.open('w') as f:
                fieldnames = data[0].keys()
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(data)


app.add_typer(clean_app, name="clean")

if __name__ == "__main__":
    app()
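
For reference, here is a minimal, illustrative way to exercise the commands above with Typer's built-in test runner instead of an installed console script. The import path of the CLI module and the CSV paths are assumptions (the module's file name is not shown in this commit), so adjust them to the real layout.

from typer.testing import CliRunner

# Hypothetical import path; the CLI module's name is not captured in this view.
from convert_to_sportspress.convert_to_sportspress import app

runner = CliRunner()

# Print the aggregated standings table for a season file (assumed fixture path).
result = runner.invoke(app, ["standings", "data/2009.csv"])
print(result.stdout)

# Convert a schedule into the SportsPress import format (assumed output path).
result = runner.invoke(app, ["sportspress-csv", "data/2009.csv", "data/2009-sportspress.csv"])
assert result.exit_code == 0

# List the distinct team names found in the file.
result = runner.invoke(app, ["clean", "list", "data/2009.csv", "Team"])
print(result.stdout)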


@@ -0,0 +1,263 @@
import csv
import re
from typing import List, Dict
from dateutil import parser
from pathlib import Path
from rich.console import Console
from rich.table import Table


def normalize_header_key(key: str) -> str:
    key_mapping = {
        "away": "visitor",
        "results": "results",
        "final score": "results",
        "venue": "field",
        "location": "field",
        "result": "results",
        "w": "win",
        "l": "loss",
        "t": "tie",
        "div": "division",
        "rf": "runs_for",
        "runs": "runs_against",
    }
    return key_mapping.get(key.lower().strip(), key.lower().strip())


def validate_csv_header(header: List[str]) -> bool:
    required_keys = ["date", "time", "field", "visitor", "home", "results"]
    normalized_header = [normalize_header_key(key) for key in header]
    return all(key in normalized_header for key in required_keys)


def read_csv(file_path: Path) -> List[dict]:
    data = []
    with open(file_path, "r", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            normalized_row = {normalize_header_key(key): value.strip() for key, value in row.items()}
            data.append(normalized_row)
    return data


def write_csv(file_path: Path, data: List[dict]) -> None:
    with open(file_path, "w", newline="") as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


def parse_score(score_str: str, reverse_order: bool = False) -> Dict[str, int]:
    """
    Parse a score string and extract home and visitor scores.

    Args:
        score_str (str): The score string, containing a "visitor-home" score somewhere within it.
        reverse_order (bool, optional): If True, the order of the scores is reversed (home first).
            Defaults to False.

    Returns:
        Dict[str, int]: A dictionary containing home and visitor scores.
    """
    regex = re.compile(r"^(?P<pre>.*?)?(?:(?P<runs_first>\d+)-(?P<runs_second>\d+))?(?P<post>.*?)?$")
    match = regex.match(score_str)
    if match:
        score = {}
        if match.group("pre"): score["pre"] = match.group("pre")
        if match.group("post"): score["post"] = match.group("post")
        if match.group("runs_first") and match.group("runs_second"):
            score['has_result'] = True
            runs_first, runs_second = int(match.group("runs_first")), int(match.group("runs_second"))
            if not reverse_order:
                score.update({
                    "home_runs_for": runs_second, "visitor_runs_for": runs_first,
                    "home_runs_against": runs_first, "visitor_runs_against": runs_second
                })
            else:
                score.update({
                    "home_runs_for": runs_first, "visitor_runs_for": runs_second,
                    "home_runs_against": runs_second, "visitor_runs_against": runs_first
                })
            if score["home_runs_for"] > score["visitor_runs_for"]:
                score["home_outcome"] = "win"
                score["visitor_outcome"] = "loss"
                if "forfeit" in score.get("post", ""):
                    score["visitor_outcome"] = "forfeit"
            elif score["home_runs_for"] < score["visitor_runs_for"]:
                score["home_outcome"] = "loss"
                score["visitor_outcome"] = "win"
                if "forfeit" in score.get("post", ""):
                    score["home_outcome"] = "forfeit"
            else:
                score["home_outcome"] = "tie"
                score["visitor_outcome"] = "tie"
        else:
            score['has_result'] = False
        return score
    raise ValueError("Invalid score format")


def is_visitor_home_order_reversed(header: List[str]) -> bool:
    """
    Determine if the order of 'visitor' and 'home' in the header suggests reversed order.
    The convention is that the home team is listed second.

    Args:
        header (List[str]): The list of header keys.

    Returns:
        bool: True if the 'home' key comes before the 'visitor' key, indicating reversed order.
    """
    return header.index('visitor') > header.index('home')


def process_data(data: List[Dict], visitor_home_order_reversed=False) -> List[Dict]:
    for row in data:
        parsed_score = parse_score(row["results"], visitor_home_order_reversed)
        row.update(parsed_score)
        try:
            row['datetime'] = parser.parse(f"{row['date']} {row['time']}")
        except parser.ParserError as e:
            raise e
    return data


def aggregate_teams(data: List[Dict[str, str]]) -> List[Dict[str, int]]:
    """
    Aggregate data by team, summing up wins, losses, and ties.

    Args:
        data (List[Dict[str, str]]): A list of dictionaries representing the CSV data.

    Returns:
        List[Dict[str, int]]: A list of dictionaries containing aggregated data for each team.
    """
    team_stats = {}
    for row in data:
        if not row["has_result"]:
            continue
        home_team = row["home"]
        visitor_team = row["visitor"]
        team_stats.setdefault(home_team, {"win": 0, "loss": 0, "tie": 0, "gp": 0, "runs_for": 0, "runs_against": 0})
        team_stats.setdefault(visitor_team, {"win": 0, "loss": 0, "tie": 0, "gp": 0, "runs_for": 0, "runs_against": 0})
        team_stats[home_team]['gp'] += 1
        team_stats[visitor_team]['gp'] += 1
        for outcome in ["win", "loss", "tie"]:
            if row["home_outcome"] == outcome:
                team_stats[home_team][outcome] += 1
                # team_stats[home_team]["games"].append(f"{row['datetime']}: {visitor_team}: {outcome[0].upper()} {row['home_runs_for']}-{row['home_runs_against']}")
            if row["visitor_outcome"] == outcome:
                team_stats[visitor_team][outcome] += 1
                # team_stats[visitor_team]["games"].append(f"{row['datetime']}: {home_team}: {outcome[0].upper()} {row['visitor_runs_for']}-{row['visitor_runs_against']}")
        team_stats[home_team]["runs_for"] += row["home_runs_for"]
        team_stats[home_team]["runs_against"] += row["home_runs_against"]
        team_stats[visitor_team]["runs_for"] += row["visitor_runs_for"]
        team_stats[visitor_team]["runs_against"] += row["visitor_runs_against"]

    # Convert team_stats dictionary to a list of dictionaries
    aggregated_data = [{"team": team, **stats} for team, stats in team_stats.items()]
    # Sort the list by number of wins, descending
    sorted_aggregated_data = sorted(aggregated_data, key=lambda x: x["win"], reverse=True)
    return sorted_aggregated_data


def write_sportspress_csv(data: List[Dict], file_path: Path, only_with_outcome: bool = False):
    """
    Writes sports event data to a CSV file in a specific format.

    Parameters:
    - data (List[Dict]): List of dictionaries where each dictionary represents a sports event.
    - file_path (Path): The Path object representing the file path where the CSV file will be created.
    - only_with_outcome (bool, optional): If True, only events with outcomes will be included in the CSV. Default is False.

    Returns:
    None

    Example:
    >>> data = [...]  # List of dictionaries representing sports events
    >>> file_path = Path("output.csv")
    >>> write_sportspress_csv(data, file_path)
    """
    with file_path.open('w') as output_csv_file:
        writer = csv.writer(output_csv_file)
        fieldnames = [
            "Format",  # Competitive or Friendly
            # "Competition",
            "Season",
            # "Date Format",
            "Date",
            "Time",
            "Venue",
            "Team",
            "Results",
            "Outcome",
            # "Players",
            # "Performance",
        ]
        # Write the header
        writer.writerow(fieldnames)
        # Write the data
        for row in data:
            if only_with_outcome and not row['has_result']:
                continue
            writer.writerow(
                [
                    row["datetime"].strftime("%Y/%m/%d"),
                    row["datetime"].strftime("%H:%M"),
                    row.get("field", ""),
                    row["home"],
                    "|".join([str(row.get(k, "")) for k in [
                        "home_runs_for_inning_1",
                        "home_runs_for_inning_2",
                        "home_runs_for_inning_3",
                        "home_runs_for_inning_4",
                        "home_runs_for_inning_5",
                        "home_runs_for_inning_6",
                        "home_runs_for_inning_7",
                        "home_runs_for_inning_8",
                        "home_runs_for_inning_9",
                        "home_runs_for_inning_10",
                        "home_runs_for",
                        "home_errors",
                        "home_hits",
                    ]]),
                    row.get("home_outcome"),
                ]
            )
            writer.writerow(
                [
                    "",
                    "",
                    "",
                    row["visitor"],
                    "|".join([str(row.get(k, "")) for k in [
                        "visitor_runs_for_inning_1",
                        "visitor_runs_for_inning_2",
                        "visitor_runs_for_inning_3",
                        "visitor_runs_for_inning_4",
                        "visitor_runs_for_inning_5",
                        "visitor_runs_for_inning_6",
                        "visitor_runs_for_inning_7",
                        "visitor_runs_for_inning_8",
                        "visitor_runs_for_inning_9",
                        "visitor_runs_for_inning_10",
                        "visitor_runs_for",
                        "visitor_errors",
                        "visitor_hits",
                    ]]),
                    row.get("visitor_outcome"),
                ]
            )
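
As an illustration of how these helpers compose (not part of the commit), the snippet below runs two in-memory rows through process_data and aggregate_teams instead of reading a file; the convert_to_sportspress.utils import path matches the one used by the tests below, and the team and field names are arbitrary sample values.

from convert_to_sportspress.utils import process_data, aggregate_teams

# Two already-normalized rows, shaped the way read_csv would produce them.
rows = [
    {"date": "2009-05-01", "time": "6:30 PM", "field": "Field 1",
     "visitor": "Mets", "home": "Marlins", "results": "5-3"},
    {"date": "2009-05-08", "time": "6:30 PM", "field": "Field 2",
     "visitor": "Marlins", "home": "Mets", "results": "4-4"},
]

processed = process_data(rows)          # adds a datetime plus parsed score/outcome fields
standings = aggregate_teams(processed)  # one dict per team, sorted by wins (descending)
for team in standings:
    print(team["team"], team["win"], team["loss"], team["tie"],
          team["runs_for"], team["runs_against"])
# Mets 1 0 1 9 7
# Marlins 0 1 1 7 9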

requirements.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
typer[all]==0.9.0
python-dateutil==2.8.2
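
If it helps to confirm that an environment matches these pins, a small check along these lines can be run inside the venv (illustrative only, not part of the commit):

from importlib.metadata import version

# Pinned versions from requirements.txt.
pins = {"typer": "0.9.0", "python-dateutil": "2.8.2"}
for package, pinned in pins.items():
    installed = version(package)
    marker = "OK" if installed == pinned else f"expected {pinned}"
    print(f"{package} {installed} ({marker})")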

tests/__init__.py (new file, 0 lines)

tests/test_utils.py (new file, 145 lines)

@@ -0,0 +1,145 @@
import unittest
from pathlib import Path
# from convert_to_sportspress
from convert_to_sportspress.utils import validate_csv_header, normalize_header_key, read_csv, parse_score, is_visitor_home_order_reversed, process_data, aggregate_teams


class TestConvertToSportsPress(unittest.TestCase):
    def setUp(self):
        # Path to the test CSV file
        self.test_csv_path_2009 = Path("data/2009.csv")

    def test_validate_csv_header(self):
        header = ["Date", "Time", "Field", "Visitor", "Home", "Results", "Results"]
        self.assertTrue(validate_csv_header(header))
        header = ["Time", "Field", "Visitor", "Home", "Results", "Results"]
        self.assertFalse(validate_csv_header(header))

    def test_normalize_header_key(self):
        self.assertEqual(normalize_header_key("Away"), "visitor")
        self.assertEqual(normalize_header_key("Visitor"), "visitor")
        self.assertEqual(normalize_header_key("Results"), "results")
        self.assertEqual(normalize_header_key("Final Score"), "results")

    def test_read_csv(self):
        # Assuming that the CSV file has a valid header
        with self.subTest("Read CSV data"):
            data = read_csv(self.test_csv_path_2009)
            self.assertIsInstance(data, list)
            self.assertTrue(all(isinstance(row, dict) for row in data))
        with self.subTest("Normalized keys"):
            normalized_data = read_csv(self.test_csv_path_2009)
            self.assertTrue(all("visitor" in row.keys() and "results" in row.keys() for row in normalized_data))

    def test_parse_score_visitor_first(self):
        with self.subTest('visitor win'):
            score_str = "5-3"
            expected_result = {
                "has_result": True,
                "home_outcome": "loss",
                "visitor_outcome": "win",
                "home_runs_for": 3, "visitor_runs_for": 5,
                "home_runs_against": 5, "visitor_runs_against": 3
            }
            result = parse_score(score_str)
            self.assertDictEqual(result, expected_result)
        with self.subTest('visitor loss'):
            score_str = "3-5"
            expected_result = {
                "has_result": True,
                "home_outcome": "win",
                "visitor_outcome": "loss",
                "home_runs_for": 5, "visitor_runs_for": 3,
                "home_runs_against": 3, "visitor_runs_against": 5
            }
            result = parse_score(score_str)
            self.assertDictEqual(result, expected_result)

    def test_parse_score_visitor_first_with_pre_post(self):
        score_str = "5-3xxxx"
        expected_result = {
            "has_result": True,
            "home_outcome": "loss",
            "visitor_outcome": "win",
            "home_runs_for": 3, "visitor_runs_for": 5,
            "home_runs_against": 5, "visitor_runs_against": 3,
            "post": "xxxx"
        }
        result = parse_score(score_str)
        self.assertEqual(result, expected_result)
        # score_str = "xxxx5-3xx"
        # expected_result = {"home_runs_for": 3, "visitor_runs_for": 5, "home_runs_against": 5, "visitor_runs_against": 3, "pre":"xxxx", "post":"xx"}
        # result = parse_score(score_str)
        # self.assertDictEqual(result, expected_result)

    def test_parse_score_home_first(self):
        score_str = "2-4"
        with self.subTest("home loss"):
            expected_result = {
                "has_result": True,
                "home_outcome": "loss",
                "visitor_outcome": "win",
                "home_runs_for": 2, "visitor_runs_for": 4,
                "home_runs_against": 4, "visitor_runs_against": 2
            }
            score_str = "2-4"
            result = parse_score(score_str, reverse_order=True)
            self.assertDictEqual(result, expected_result)
        with self.subTest("home win"):
            expected_result = {
                "has_result": True,
                "home_outcome": "win",
                "visitor_outcome": "loss",
                "home_runs_for": 4, "visitor_runs_for": 2,
                "home_runs_against": 2, "visitor_runs_against": 4
            }
            score_str = "4-2"
            result = parse_score(score_str, reverse_order=True)
            self.assertDictEqual(result, expected_result)

    def test_parse_score_invalid_format(self):
        score_str = "invalid_format"
        expected_result = {'has_result': False, "post": "invalid_format"}
        result = parse_score(score_str)
        self.assertDictEqual(result, expected_result)

    def test_is_visitor_home_order_reversed_false(self):
        # Visitor listed before home is the conventional order, so this is not reversed.
        header = ["date", "time", "field", "visitor", "home", "results", "results"]
        result = is_visitor_home_order_reversed(header)
        self.assertFalse(result)

    def test_is_visitor_home_order_reversed_true(self):
        # Home listed before visitor indicates the reversed order.
        header = ["date", "time", "field", "home", "visitor", "results", "results"]
        result = is_visitor_home_order_reversed(header)
        self.assertTrue(result)

    def test_process_data(self):
        # Assuming that the CSV file has a valid header and read_csv is good
        data = read_csv(self.test_csv_path_2009)
        processed_data = process_data(data)
        aggregate_team_data = aggregate_teams(processed_data)
        expected_result = [
            {"team": "Marlins", "gp": 28, "win": 23, "loss": 5, "tie": 0, "pts": 46, "runs_for": 249, "runs_against": 117},
            {"team": "Mets", "gp": 28, "win": 20, "loss": 8, "tie": 0, "pts": 40, "runs_for": 265, "runs_against": 150},
            {"team": "Browns", "gp": 28, "win": 17, "loss": 11, "tie": 0, "pts": 34, "runs_for": 221, "runs_against": 201},
            {"team": "Yankees", "gp": 28, "win": 15, "loss": 12, "tie": 1, "pts": 31, "runs_for": 189, "runs_against": 163},
            {"team": "Rangers", "gp": 28, "win": 15, "loss": 13, "tie": 0, "pts": 30, "runs_for": 203, "runs_against": 188},
            {"team": "Hounds", "gp": 28, "win": 14, "loss": 14, "tie": 0, "pts": 28, "runs_for": 181, "runs_against": 161},
            {"team": "Electrons", "gp": 28, "win": 13, "loss": 14, "tie": 1, "pts": 27, "runs_for": 168, "runs_against": 185},
            {"team": "Vikings", "gp": 28, "win": 12, "loss": 16, "tie": 0, "pts": 24, "runs_for": 201, "runs_against": 229},
            {"team": "Athletics", "gp": 28, "win": 8, "loss": 18, "tie": 2, "pts": 18, "runs_for": 157, "runs_against": 258},
            {"team": "Red Sox", "gp": 28, "win": 8, "loss": 20, "tie": 0, "pts": 16, "runs_for": 156, "runs_against": 244},
            {"team": "Aviators", "gp": 28, "win": 7, "loss": 21, "tie": 0, "pts": 14, "runs_for": 168, "runs_against": 262}
        ]
        for team, expected_dict in [(row['team'], row) for row in expected_result]:
            with self.subTest(f'Contains team "{team}"'):
                aggregate_team_data_dict = [item for item in aggregate_team_data if item.get('team') == team]
                self.assertEqual(len(aggregate_team_data_dict), 1)
                aggregate_team_data_dict = aggregate_team_data_dict[0]
            with self.subTest(f'Results of "{team}"'):
                self.assertDictContainsSubset(aggregate_team_data_dict, expected_dict)


if __name__ == "__main__":
    unittest.main()
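
The suite expects the data/2009.csv fixture referenced in setUp to be present. One way to run it programmatically from the repository root (a sketch, equivalent to a plain unittest discover invocation):

import unittest

# Discover and run everything under tests/ with verbose output.
suite = unittest.defaultTestLoader.discover("tests")
unittest.TextTestRunner(verbosity=2).run(suite)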