Improve schedule scraping and rating computation with better docs

- Add detailed docstrings and comments to build_season_schedule.py scraping/parsing functions
- Enhance compute_ratings.py with clearer parameter docs and refined data handling
- Improve Elo calculation stability and add composite rating output with explained metrics
2025-08-28 15:16:32 -05:00
parent ef11cdbac3
commit 03f87c205b
2 changed files with 246 additions and 49 deletions
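The compute_ratings.py hunks are not reproduced below. As a reference point for the Elo change named in the commit message, here is a minimal sketch of a plain Elo update; the function name, K-factor, and home-advantage constant are hypothetical and not the project's actual code.

# Hypothetical sketch only; compute_ratings.py's real implementation is not
# shown in this diff, and k / home_adv are illustrative constants.
def elo_update(r_home: float, r_away: float,
               home_runs: int, away_runs: int,
               k: float = 24.0, home_adv: float = 30.0) -> tuple[float, float]:
    """Return updated (home, away) ratings after one completed game."""
    expected_home = 1.0 / (1.0 + 10 ** (-(r_home + home_adv - r_away) / 400.0))
    actual_home = 1.0 if home_runs > away_runs else 0.0 if home_runs < away_runs else 0.5
    delta = k * (actual_home - expected_home)
    return r_home + delta, r_away - delta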


@@ -41,24 +41,31 @@ logging.basicConfig(
# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
GAME_BASE = "https://www.csyba.com/game/show/{gid}"
HEADERS = {"User-Agent": UA} # HTTP headers with custom User-Agent for requests
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}" # base URL for team-instance printable schedule
GAME_BASE = "https://www.csyba.com/game/show/{gid}" # base URL for game detail page
# Regular expressions for parsing scores, game links, and time strings
SCORE_RE = re.compile(r"\b(\d+)\s*[-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)
# ----------------- helpers -----------------
def clean(x: str) -> str:
"""Normalize whitespace and strip input string."""
return re.sub(r"\s+", " ", (x or "")).strip()
def slugify(s: str) -> str:
"""Convert string to lowercase slug with words separated by hyphens."""
s = s.lower()
s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
return s
def norm_name(s: str) -> str:
"""
Normalize team names by lowercasing, removing common words like 'the', 'club',
and stripping punctuation, to help with loose matching.
"""
s = s.lower()
s = re.sub(r"[^a-z0-9 ]+", " ", s)
s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
@@ -67,6 +74,7 @@ def norm_name(s: str) -> str:
@dataclass(frozen=True)
class TeamRec:
"""Data class representing a team record with identifying information."""
name: str
slug: str
team_id: str
@@ -74,7 +82,10 @@ class TeamRec:
subseason_id: str
def load_teams(teams_path: str):
"""Load mapping tables from teams.json you provided."""
"""
Load team mapping data from JSON file.
Returns dictionaries keyed by instance_id, slug, and normalized names for lookups.
"""
with open(teams_path, "r", encoding="utf-8") as f:
arr = json.load(f)
by_instance: Dict[str, TeamRec] = {}
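The index-building loop sits below this hunk. A hedged sketch of how the three lookup tables could be populated, continuing the body above; the teams.json field names and the TeamRec field order are assumptions.

# Hedged sketch only: the real loop is outside this hunk, and the JSON keys
# ("name", "slug", "team_id", ...) are assumed, not confirmed by the diff.
by_slug: Dict[str, TeamRec] = {}
by_norm: Dict[str, TeamRec] = {}
for obj in arr:
    rec = TeamRec(
        name=obj["name"],
        slug=obj["slug"],
        team_id=str(obj.get("team_id", "")),
        instance_id=str(obj["instance_id"]),
        subseason_id=str(obj.get("subseason_id", "")),
    )
    by_instance[rec.instance_id] = rec
    by_slug[rec.slug] = rec
    by_norm[norm_name(rec.name)] = rec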
@@ -94,7 +105,11 @@ def load_teams(teams_path: str):
return by_instance, by_slug, by_norm
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
"""Match opponent using slug first, then normalized name, then loose containment."""
"""
Attempt to match the opponent team name to a known team record.
Tries slug first, then normalized name exact match,
then loose containment matching on normalized names.
"""
s = slugify(opponent_text)
if s in by_slug:
return by_slug[s]
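The normalized-name and containment fallbacks described in the docstring sit below this hunk. A minimal standalone sketch of what that loose matching could look like; the real code may differ in detail.

# Hedged sketch of the fallback matching, written as a separate helper.
def _loose_match(opponent_text: str, by_norm: Dict[str, TeamRec]) -> Optional[TeamRec]:
    n = norm_name(opponent_text)
    if n in by_norm:
        return by_norm[n]
    for key, rec in by_norm.items():
        if key and (key in n or n in key):  # containment in either direction
            return rec
    return None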
@@ -108,8 +123,8 @@ def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
"""
Team-instance pages are TEAM-FIRST. s_a is THIS team's runs, s_b is opponent runs.
We don't reorder; we only validate with W/L/T if needed.
Parse runs scored by team and opponent, assuming team-first order.
Validate results with result_flag (W/L/T).
"""
if not (s_a.isdigit() and s_b.isdigit()):
return None, None
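The remainder of the function falls outside this hunk. A hedged sketch of how the W/L/T validation could complete it while keeping the team-first order:

# Sketch only; the actual validation below this hunk may differ.
def runs_from_team_pov_sketch(result_flag: str, s_a: str, s_b: str):
    if not (s_a.isdigit() and s_b.isdigit()):
        return None, None
    a, b = int(s_a), int(s_b)
    # Team-first order is trusted; the W/L flag only sanity-checks the score.
    if (result_flag == "W" and a < b) or (result_flag == "L" and a > b):
        logging.debug(f"Score {a}-{b} disagrees with result flag {result_flag}")
    return a, b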
@@ -122,6 +137,10 @@ def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
"""
Fetch a URL and return a BeautifulSoup parsed document.
Uses a shared requests.Session if provided.
"""
try:
sess = session or requests.Session()
r = sess.get(url, headers=HEADERS, timeout=timeout)
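The error handling and parsing tail of get_soup is below this hunk. A sketch of the complete helper under the same signature; the "html.parser" choice is an assumption.

# Sketch of the full fetch helper; the tail of the real function is not shown.
def get_soup_sketch(url: str, session: Optional[requests.Session] = None,
                    timeout: int = 30) -> Optional[BeautifulSoup]:
    try:
        sess = session or requests.Session()
        r = sess.get(url, headers=HEADERS, timeout=timeout)
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    except Exception as e:
        logging.warning(f"GET failed for {url}: {e}")
        return None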
@@ -133,7 +152,10 @@ def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int
# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
"""Parse one team-instance printable schedule page into perspective rows."""
"""
Download and parse the team-instance printable schedule page,
extracting a list of game dictionaries from the perspective of that team.
"""
url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
"schedule_type": "index",
"subseason": subseason_id,
@@ -148,25 +170,27 @@ def parse_printable(instance_id: str, subseason_id: str, session: requests.Sessi
return []
games = []
# Skip header row; iterate over game rows
for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
tds = tr.select("td")
if len(tds) < 5:
continue
# Cells: Date | Result | Opponent | Location | Status
# Extract text from each relevant cell:
# Date | Result | Opponent | Location | Status
date_txt = clean(tds[0].get_text(" "))
result_txt = clean(tds[1].get_text(" "))
opp_txt = clean(tds[2].get_text(" "))
loc_txt = clean(tds[3].get_text(" "))
status_txt = clean(tds[4].get_text(" "))
# Date → ISO
# Parse date into ISO format (YYYY-MM-DD) if possible
try:
date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
except Exception:
date_iso = date_txt
date_iso = date_txt # leave raw if parsing fails
# Pull a game_id if present (from any link in the row)
# Find game ID from any game/show links in the row, if present
game_id = ""
for a in tr.select("a[href]"):
m = GAME_LINK_RE.search(a.get("href", ""))
@@ -174,19 +198,19 @@ def parse_printable(instance_id: str, subseason_id: str, session: requests.Sessi
game_id = m.group(1)
break
# Extract W/L/T (Result cell)
# Extract W/L/T indicator from Result cell
m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
result_flag = m_res.group(1).upper() if m_res else ""
# Extract score from Result cell; if missing, also try Opponent cell
# Extract numeric scores from Result or Opponent cell
m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
# Opponent + home/away flag
# Determine if game is away based on '@' prefix in opponent cell
is_away = opp_txt.startswith("@")
opponent_name = opp_txt.lstrip("@").strip()
# Compute team/opp runs (TEAM-FIRST orientation)
# Convert scores to integers with team-first orientation
team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)
logging.debug(
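This hunk ends mid-statement at the debug call above. For orientation, here is what the cell-parsing regexes extract from a made-up row (team name and IDs are invented):

# Illustration of SCORE_RE, GAME_LINK_RE, and the W/L/T check on sample text.
result_cell = "W 7 - 3"
opp_cell = "@ Ridgeview Rockets"
href = "/game/show/987654"
print(SCORE_RE.search(result_cell).groups())              # ('7', '3')
print(re.search(r"\b(W|L|T)\b", result_cell).group(1))    # 'W'
print(GAME_LINK_RE.search(href).group(1))                 # '987654'
print(opp_cell.startswith("@"), opp_cell.lstrip("@").strip())  # True Ridgeview Rockets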
@@ -214,10 +238,9 @@ def parse_printable(instance_id: str, subseason_id: str, session: requests.Sessi
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
"""
Fetch the game's local start time from the /game/show/<id> page.
Looks inside the tab with id 'tab_boxscores_content' but also
falls back to scanning the page for common time patterns.
Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
Fetch the start time of a game from its detail page.
Looks inside the boxscores tab or scans text for time patterns.
Returns a 24-hour formatted 'HH:MM' string or None if not found.
"""
if not game_id:
return None
@@ -226,13 +249,13 @@ def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
if not soup:
return None
# Prefer the boxscores tab content
# Prefer boxscores tab content to search for time string
box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
text = ""
if box:
text = " ".join(box.stripped_strings)
else:
# Fall back to page-wide text (but avoid pulling too much)
# Fall back to main page text with length limit to prevent excessive text processing
main = soup.select_one("div.page") or soup
text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())
@@ -244,21 +267,19 @@ def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
hhmm = m.group(1)
ampm = (m.group(2) or "").lower().replace(".", "")
try:
# Normalize to 24h HH:MM
# Normalize time to 24h format
from datetime import datetime
if ampm:
dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
else:
# already 24h-ish
dt = datetime.strptime(hhmm, "%H:%M")
return dt.strftime("%H:%M")
except Exception:
# Be forgiving (e.g., "6:00pm" without space)
# Try forgiving parse if combined time/ampm without space
try:
from datetime import datetime
hhmm2 = hhmm
if ampm:
dt = datetime.strptime(f"{hhmm2}{ampm}", "%I:%M%p")
dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
return dt.strftime("%H:%M")
except Exception:
logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
@@ -272,25 +293,34 @@ def main(
fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/<id>"),
sleep: float = typer.Option(0.35, help="Delay between requests (seconds)")
):
"""
Main function to scrape schedules for all teams, merge them,
deduplicate entries (primarily by game_id), and output a consolidated CSV.
Optionally fetches start times per game.
"""
# Load teams data and indexes
by_instance, by_slug, by_norm = load_teams(teams)
instance_ids = sorted(by_instance.keys())
# Requests session with custom headers
session = requests.Session()
session.headers.update(HEADERS)
# Scrape all teams
# Scrape all team instance printable schedules
raw: List[dict] = []
for i, iid in enumerate(instance_ids, 1):
logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
raw.extend(parse_printable(iid, subseason, session=session))
time.sleep(sleep) # be polite
# Helper lookups for team records
def rec_from_instance(iid: str) -> Optional[TeamRec]:
return by_instance.get(iid)
def match_opponent(text: str) -> Optional[TeamRec]:
return best_match_team(text, by_slug, by_norm)
# Dedupe buckets keyed by game_id, or by a fallback composite key when game_id is missing
buckets: Dict[str, dict] = {}
fallback_rows = 0
@@ -313,6 +343,7 @@ def main(
key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
fallback_rows += 1
# Store perspective of one team's view of the game
perspective = {
"team": team_rec,
"opp": opp_rec, # may be None
@@ -336,42 +367,54 @@ def main(
logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")
out_rows = []
time_cache: Dict[str, Optional[str]] = {}
time_cache: Dict[str, Optional[str]] = {} # cache game times to avoid re-fetching
# Merge perspectives and produce consolidated rows
for key, bucket in buckets.items():
p = bucket["persp"]
date = p[0]["date"]
game_id = bucket.get("game_id", "")
# Try to identify home and away perspectives
p_home = next((x for x in p if x["is_away"] is False), None)
p_away = next((x for x in p if x["is_away"] is True), None)
# Home is the perspective that is not away; otherwise fall back to the other perspective's opponent
home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))
def pack_team(rec: Optional[TeamRec], fallback_slug: str):
"""Pack team record to tuple or fallback to slug-based default values."""
if rec:
return rec.slug, rec.instance_id, rec.team_id, rec.name
return fallback_slug, "", "", fallback_slug.replace("-", " ").title()
# Attempt to get runs from home perspective
home_runs = away_runs = None
if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
home_runs = p_home["team_runs"]
away_runs = p_home["opp_runs"]
# Otherwise try away perspective with reversed runs
elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
away_runs = p_away["team_runs"]
home_runs = p_away["opp_runs"]
# If runs still missing, guess from first perspective, adjusting for is_away
if (home_runs is None or away_runs is None) and p:
one = p[0]
if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
if one["is_away"]:
away_runs = one["team_runs"]; home_runs = one["opp_runs"]
away_team = one["team"]; home_team = one["opp"] if one["opp"] else home_team
away_runs = one["team_runs"]
home_runs = one["opp_runs"]
away_team = one["team"]
home_team = one["opp"] if one["opp"] else home_team
else:
home_runs = one["team_runs"]; away_runs = one["opp_runs"]
home_team = one["team"]; away_team = one["opp"] if one["opp"] else away_team
home_runs = one["team_runs"]
away_runs = one["opp_runs"]
home_team = one["team"]
away_team = one["opp"] if one["opp"] else away_team
# Fallback guesses for home and away slugs if team data missing
guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
p_away["opp"].slug if p_away and p_away["opp"] else
p[0]["pair"][0])
@@ -382,6 +425,7 @@ def main(
home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)
# Determine winner and loser slugs based on runs
winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
if isinstance(home_runs, int) and isinstance(away_runs, int):
if home_runs > away_runs:
@@ -391,10 +435,12 @@ def main(
winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id
# Consolidate location and status from home or away perspectives
loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
source_urls = sorted({x["source_url"] for x in p})
# Optionally fetch game start time
time_local = ""
if fetch_time and game_id:
if game_id in time_cache:
@@ -403,6 +449,7 @@ def main(
logging.debug(f"TIME: fetching game {game_id}")
tval = fetch_game_time(game_id, session=session)
time_cache[game_id] = tval
# If no time found, wait longer before next request to be polite
if tval is None:
time.sleep(min(sleep * 2, 1.0))
if tval:
@@ -413,6 +460,7 @@ def main(
f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
)
# Append consolidated game record for CSV output
out_rows.append({
"date_local": date,
"time_local": time_local,
@@ -431,6 +479,7 @@ def main(
logging.warning("No games produced.")
return
# Define CSV output columns
fieldnames = [
"date_local","time_local",
"home_slug","home_instance","home_id","home_name",
@@ -440,6 +489,7 @@ def main(
"loser_slug","loser_instance","loser_id",
"location","status","game_id","source_urls",
]
# Write consolidated game data to CSV
with open(out, "w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=fieldnames)
w.writeheader()