csyba/build_season_schedule.py
#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
# - Optionally fetches each game's time from the /game/show/<id> page ("tab_boxscores_content").
#
# Usage:
# pip install requests beautifulsoup4 python-dateutil typer
# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
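# With optional per-game time lookup (adds one /game/show/<id> request per game):
# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv --fetch-time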
#
# Example teams.json (array):
# [
# {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
# ...
# ]
import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Optional
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp
import typer
# ----------------- logging -----------------
logging.basicConfig(
level=logging.INFO, # change to DEBUG for verbose tracing
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%H:%M:%S",
)
# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA} # HTTP headers with custom User-Agent for requests
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}" # base URL for team-instance printable schedule
GAME_BASE = "https://www.csyba.com/game/show/{gid}" # base URL for game detail page
# Regular expressions for parsing scores, game links, and time strings
SCORE_RE = re.compile(r"\b(\d+)\s*[-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)
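# Illustrative matches for the patterns above (sample strings only, not real page content):
#   SCORE_RE:     "7 - 4"                 -> groups ("7", "4")
#   GAME_LINK_RE: "/game/show/12345678"   -> group "12345678"
#   TIME_RE:      "5:30 PM" -> ("5:30", "PM");  "18:45" -> ("18:45", None)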
# ----------------- helpers -----------------
def clean(x: str) -> str:
"""Normalize whitespace and strip input string."""
return re.sub(r"\s+", " ", (x or "")).strip()
def slugify(s: str) -> str:
"""Convert string to lowercase slug with words separated by hyphens."""
s = s.lower()
s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
return s
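# For example (sample strings, shown only to illustrate the helpers above):
#   clean("  Carol   Stream ")        -> "Carol Stream"
#   slugify("Carol Stream Cheaties!") -> "carol-stream-cheaties"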
def norm_name(s: str) -> str:
"""
Normalize team names by lowercasing, removing common words like 'the', 'club',
and stripping punctuation, to help with loose matching.
"""
s = s.lower()
s = re.sub(r"[^a-z0-9 ]+", " ", s)
s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
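# For example (hypothetical team names, shown only to illustrate the normalization):
#   norm_name("The Wheaton Rams Baseball Club") -> "wheaton rams"
#   norm_name("Carol Stream Cheaties")          -> "carol stream cheaties"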
@dataclass(frozen=True)
class TeamRec:
"""Data class representing a team record with identifying information."""
name: str
slug: str
team_id: str
instance_id: str
subseason_id: str
def load_teams(teams_path: str):
"""
Load team mapping data from JSON file.
Returns dictionaries keyed by instance_id, slug, and normalized names for lookups.
"""
with open(teams_path, "r", encoding="utf-8") as f:
arr = json.load(f)
by_instance: Dict[str, TeamRec] = {}
by_slug: Dict[str, TeamRec] = {}
by_norm: Dict[str, TeamRec] = {}
for t in arr:
rec = TeamRec(
name=str(t["teamName"]),
slug=str(t["team_slug"]),
team_id=str(t["team_id"]),
instance_id=str(t["instance_id"]),
subseason_id=str(t["subseason_id"]),
)
by_instance[rec.instance_id] = rec
by_slug[rec.slug] = rec
by_norm[norm_name(rec.name)] = rec
return by_instance, by_slug, by_norm
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
"""
Attempt to match the opponent team name to a known team record.
Tries slug first, then normalized name exact match,
then loose containment matching on normalized names.
"""
s = slugify(opponent_text)
if s in by_slug:
return by_slug[s]
n = norm_name(opponent_text)
if n in by_norm:
return by_norm[n]
for key, rec in by_norm.items():
if key in n or n in key:
return rec
return None
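# For example, given a teams.json entry named "Carol Stream Cheaties" (slug "carol-stream-cheaties"),
# an opponent cell of "Carol Stream Cheaties" resolves via the slug lookup, while a looser label
# such as "Carol Stream" only resolves through the containment fallback on normalized names.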
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
"""
Parse runs scored by the team and its opponent, assuming team-first order.
Log a debug note if the scores disagree with result_flag (W/L/T), but keep them as-is.
"""
if not (s_a.isdigit() and s_b.isdigit()):
return None, None
a, b = int(s_a), int(s_b)
if result_flag == "W" and a <= b:
logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).")
if result_flag == "L" and a >= b:
logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).")
return a, b
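# For example (scores are team-first, matching the printable page):
#   runs_from_team_pov("W", "7", "4") -> (7, 4)
#   runs_from_team_pov("L", "2", "9") -> (2, 9)
#   runs_from_team_pov("W", "", "")   -> (None, None)   # missing or non-numeric scores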
# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
"""
Fetch a URL and return a BeautifulSoup parsed document.
Uses a shared requests.Session if provided.
"""
try:
sess = session or requests.Session()
r = sess.get(url, headers=HEADERS, timeout=timeout)
r.raise_for_status()
return BeautifulSoup(r.text, "html.parser")
except Exception as e:
logging.error(f"GET failed {url}: {e}")
return None
# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
"""
Download and parse the team-instance printable schedule page,
extracting a list of game dictionaries from the perspective of that team.
"""
url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
"schedule_type": "index",
"subseason": subseason_id,
})
soup = get_soup(url, session=session)
if not soup:
return []
table = soup.select_one("table")
if not table:
logging.warning(f"No table found for team_instance={instance_id}")
return []
games = []
# Skip header row; iterate over game rows
for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
tds = tr.select("td")
if len(tds) < 5:
continue
# Extract text from each relevant cell:
# Date | Result | Opponent | Location | Status
date_txt = clean(tds[0].get_text(" "))
result_txt = clean(tds[1].get_text(" "))
opp_txt = clean(tds[2].get_text(" "))
loc_txt = clean(tds[3].get_text(" "))
status_txt = clean(tds[4].get_text(" "))
# Parse date into ISO format (YYYY-MM-DD) if possible
try:
date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
except Exception:
date_iso = date_txt # leave raw if parsing fails
# Find game ID from any game/show links in the row, if present
game_id = ""
for a in tr.select("a[href]"):
m = GAME_LINK_RE.search(a.get("href", ""))
if m:
game_id = m.group(1)
break
# Extract W/L/T indicator from Result cell
m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
result_flag = m_res.group(1).upper() if m_res else ""
# Extract numeric scores from Result or Opponent cell
m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
# Determine if game is away based on '@' prefix in opponent cell
is_away = opp_txt.startswith("@")
opponent_name = opp_txt.lstrip("@").strip()
# Convert scores to integers with team-first orientation
team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)
logging.debug(
f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
f"→ team_runs={team_runs}, opp_runs={opp_runs}"
)
games.append({
"team_instance": instance_id,
"game_id": game_id, # may be empty
"date": date_iso,
"result": result_flag, # W/L/T from THIS TEAM's perspective
"team_runs": team_runs,
"opp_runs": opp_runs,
"opponent_name": opponent_name,
"is_away": is_away,
"location": loc_txt,
"status": status_txt,
"source_url": url,
})
logging.info(f"Team {instance_id}: parsed {len(games)} rows")
return games
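# Each parsed row is a dict shaped roughly like this (values are illustrative, not real data):
#   {"team_instance": "10119604", "game_id": "9876543", "date": "2025-06-14", "result": "W",
#    "team_runs": 7, "opp_runs": 4, "opponent_name": "Carol Stream Cheaties", "is_away": True,
#    "location": "Field 2", "status": "Final", "source_url": "https://www.csyba.com/..."}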
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
"""
Fetch the start time of a game from its detail page.
Looks inside the boxscores tab or scans text for time patterns.
Returns a 24-hour formatted 'HH:MM' string or None if not found.
"""
if not game_id:
return None
url = GAME_BASE.format(gid=game_id)
soup = get_soup(url, session=session, timeout=30)
if not soup:
return None
# Prefer boxscores tab content to search for time string
box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
text = ""
if box:
text = " ".join(box.stripped_strings)
else:
# Fall back to main page text with length limit to prevent excessive text processing
main = soup.select_one("div.page") or soup
text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())
m = TIME_RE.search(text)
if not m:
logging.debug(f"TIME: no time found in game {game_id}")
return None
hhmm = m.group(1)
ampm = (m.group(2) or "").lower().replace(".", "")
try:
# Normalize time to 24h format
from datetime import datetime
if ampm:
dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
else:
dt = datetime.strptime(hhmm, "%H:%M")
return dt.strftime("%H:%M")
except Exception:
# Try forgiving parse if combined time/ampm without space
try:
from datetime import datetime
if ampm:
dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
return dt.strftime("%H:%M")
except Exception:
logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
return None
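# For example, "5:30 PM" or "5:30 p.m." on a game page normalizes to "17:30", a bare "18:45"
# is returned as-is, and None is returned when no time-like text is found.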
# ----------------- build & merge -----------------
def main(
subseason: str = typer.Option(..., help="Subseason ID, e.g. 942425"),
teams: str = typer.Option(..., help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)"),
out: str = typer.Option("season_schedule.csv", help="Output CSV path"),
fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/<id>"),
sleep: float = typer.Option(0.35, help="Delay between requests (seconds)")
):
"""
Main function to scrape schedules for all teams, merge them,
deduplicate entries (primary by game_id), and output a consolidated CSV.
Optionally fetches start times per game.
"""
# Load teams data and indexes
by_instance, by_slug, by_norm = load_teams(teams)
instance_ids = sorted(by_instance.keys())
# Requests session with custom headers
session = requests.Session()
session.headers.update(HEADERS)
# Scrape all team instance printable schedules
raw: List[dict] = []
for i, iid in enumerate(instance_ids, 1):
logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
raw.extend(parse_printable(iid, subseason, session=session))
time.sleep(sleep) # be polite
# Helper lookups for team records
def rec_from_instance(iid: str) -> Optional[TeamRec]:
return by_instance.get(iid)
def match_opponent(text: str) -> Optional[TeamRec]:
return best_match_team(text, by_slug, by_norm)
# Deduplicate into buckets keyed by game_id, falling back to a composite key when no game_id is present
buckets: Dict[str, dict] = {}
fallback_rows = 0
for row in raw:
team_rec = rec_from_instance(row["team_instance"])
if not team_rec:
logging.warning(f"Unknown instance {row['team_instance']}; skipping")
continue
opp_rec = match_opponent(row["opponent_name"])
opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
pair = tuple(sorted([team_rec.slug, opp_slug]))
if row["game_id"]:
key = f"id:{row['game_id']}"
else:
runs_sig = ""
if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
runs_sig = f"{row['team_runs']}-{row['opp_runs']}"
key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
fallback_rows += 1
# Store this team's perspective of the game
perspective = {
"team": team_rec,
"opp": opp_rec, # may be None
"is_away": row["is_away"],
"team_runs": row["team_runs"],
"opp_runs": row["opp_runs"],
"location": row["location"],
"status": row["status"],
"source_url": row["source_url"],
"pair": pair,
"date": row["date"],
"game_id": row["game_id"],
}
if key not in buckets:
buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
else:
buckets[key]["persp"].append(perspective)
if fallback_rows:
logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")
out_rows = []
time_cache: Dict[str, Optional[str]] = {} # cache game times to avoid re-fetching
# Merge perspectives and produce consolidated rows
for key, bucket in buckets.items():
p = bucket["persp"]
date = p[0]["date"]
game_id = bucket.get("game_id", "")
# Try to identify home and away perspectives
p_home = next((x for x in p if x["is_away"] is False), None)
p_away = next((x for x in p if x["is_away"] is True), None)
# Home is the team whose row is not marked away; otherwise fall back to the away row's opponent
home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))
def pack_team(rec: Optional[TeamRec], fallback_slug: str):
"""Pack team record to tuple or fallback to slug-based default values."""
if rec:
return rec.slug, rec.instance_id, rec.team_id, rec.name
return fallback_slug, "", "", fallback_slug.replace("-", " ").title()
# Attempt to get runs from home perspective
home_runs = away_runs = None
if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
home_runs = p_home["team_runs"]
away_runs = p_home["opp_runs"]
# Otherwise try away perspective with reversed runs
elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
away_runs = p_away["team_runs"]
home_runs = p_away["opp_runs"]
# If runs still missing, guess from first perspective, adjusting for is_away
if (home_runs is None or away_runs is None) and p:
one = p[0]
if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
if one["is_away"]:
away_runs = one["team_runs"]
home_runs = one["opp_runs"]
away_team = one["team"]
home_team = one["opp"] if one["opp"] else home_team
else:
home_runs = one["team_runs"]
away_runs = one["opp_runs"]
home_team = one["team"]
away_team = one["opp"] if one["opp"] else away_team
# Fallback guesses for home and away slugs if team data missing
guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
p_away["opp"].slug if p_away and p_away["opp"] else
p[0]["pair"][0])
guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
p_home["opp"].slug if p_home and p_home["opp"] else
p[0]["pair"][1])
home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)
# Determine winner and loser slugs based on runs
winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
if isinstance(home_runs, int) and isinstance(away_runs, int):
if home_runs > away_runs:
winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
elif away_runs > home_runs:
winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id
# Consolidate location and status from home or away perspectives
loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
source_urls = sorted({x["source_url"] for x in p})
# Optionally fetch game start time
time_local = ""
if fetch_time and game_id:
if game_id in time_cache:
tval = time_cache[game_id]
else:
logging.debug(f"TIME: fetching game {game_id}")
tval = fetch_game_time(game_id, session=session)
time_cache[game_id] = tval
# If no time was found, back off a bit longer before the next request to stay polite
if tval is None:
time.sleep(min(sleep * 2, 1.0))
if tval:
time_local = tval
logging.debug(
f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
)
# Append consolidated game record for CSV output
out_rows.append({
"date_local": date,
"time_local": time_local,
"home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
"away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
"home_runs": "" if home_runs is None else home_runs,
"away_runs": "" if away_runs is None else away_runs,
"winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
"loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
"location": loc, "status": status,
"game_id": game_id,
"source_urls": " ".join(source_urls),
})
if not out_rows:
logging.warning("No games produced.")
return
# Define CSV output columns
fieldnames = [
"date_local","time_local",
"home_slug","home_instance","home_id","home_name",
"away_slug","away_instance","away_id","away_name",
"home_runs","away_runs",
"winner_slug","winner_instance","winner_id",
"loser_slug","loser_instance","loser_id",
"location","status","game_id","source_urls",
]
# Write consolidated game data to CSV
with open(out, "w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=fieldnames)
w.writeheader()
for r in out_rows:
w.writerow(r)
logging.info(f"Wrote {len(out_rows)} games → {out}")
if __name__ == "__main__":
typer.run(main)